From e25c81551bc57313c0df732c1ccb83c5398a1cf0 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Tue, 11 Nov 2025 17:14:01 +0000 Subject: [PATCH 01/64] [X86] Add 128-bit vector test coverage for #167498 (#167531) --- llvm/test/CodeGen/X86/build-vector-128.ll | 466 ++++++++++++++++++++++ 1 file changed, 466 insertions(+) diff --git a/llvm/test/CodeGen/X86/build-vector-128.ll b/llvm/test/CodeGen/X86/build-vector-128.ll index e2db8d4241420..b8bb417e1860c 100644 --- a/llvm/test/CodeGen/X86/build-vector-128.ll +++ b/llvm/test/CodeGen/X86/build-vector-128.ll @@ -410,6 +410,472 @@ define <16 x i8> @test_buildvector_v16i8(i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4, ret <16 x i8> %ins15 } +; build vectors where integers operands are split (typically via legalization) + +define <4 x i32> @test_buildvector_v2i64_split_v4i32(i64 %a0, i64 %a1) nounwind { +; SSE-32-LABEL: test_buildvector_v2i64_split_v4i32: +; SSE-32: # %bb.0: +; SSE-32-NEXT: movups {{[0-9]+}}(%esp), %xmm0 +; SSE-32-NEXT: retl +; +; SSE2-64-LABEL: test_buildvector_v2i64_split_v4i32: +; SSE2-64: # %bb.0: +; SSE2-64-NEXT: movl %edi, %eax +; SSE2-64-NEXT: movl %esi, %ecx +; SSE2-64-NEXT: shrq $32, %rdi +; SSE2-64-NEXT: shrq $32, %rsi +; SSE2-64-NEXT: movd %ecx, %xmm1 +; SSE2-64-NEXT: movd %esi, %xmm0 +; SSE2-64-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE2-64-NEXT: movd %eax, %xmm0 +; SSE2-64-NEXT: movd %edi, %xmm2 +; SSE2-64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE2-64-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE2-64-NEXT: retq +; +; SSE41-64-LABEL: test_buildvector_v2i64_split_v4i32: +; SSE41-64: # %bb.0: +; SSE41-64-NEXT: movl %edi, %eax +; SSE41-64-NEXT: movl %esi, %ecx +; SSE41-64-NEXT: shrq $32, %rdi +; SSE41-64-NEXT: shrq $32, %rsi +; SSE41-64-NEXT: movd %eax, %xmm0 +; SSE41-64-NEXT: pinsrd $1, %edi, %xmm0 +; SSE41-64-NEXT: pinsrd $2, %ecx, %xmm0 +; SSE41-64-NEXT: pinsrd $3, %esi, %xmm0 +; SSE41-64-NEXT: retq +; +; AVX-32-LABEL: test_buildvector_v2i64_split_v4i32: +; AVX-32: # %bb.0: +; AVX-32-NEXT: vmovups {{[0-9]+}}(%esp), %xmm0 +; AVX-32-NEXT: retl +; +; AVX-64-LABEL: test_buildvector_v2i64_split_v4i32: +; AVX-64: # %bb.0: +; AVX-64-NEXT: movl %edi, %eax +; AVX-64-NEXT: movl %esi, %ecx +; AVX-64-NEXT: shrq $32, %rdi +; AVX-64-NEXT: shrq $32, %rsi +; AVX-64-NEXT: vmovd %eax, %xmm0 +; AVX-64-NEXT: vpinsrd $1, %edi, %xmm0, %xmm0 +; AVX-64-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0 +; AVX-64-NEXT: vpinsrd $3, %esi, %xmm0, %xmm0 +; AVX-64-NEXT: retq + %a0.lo = trunc i64 %a0 to i32 + %a1.lo = trunc i64 %a1 to i32 + %a0.shr = lshr i64 %a0, 32 + %a1.shr = lshr i64 %a1, 32 + %a0.hi = trunc i64 %a0.shr to i32 + %a1.hi = trunc i64 %a1.shr to i32 + %v0 = insertelement <4 x i32> poison, i32 %a0.lo, i64 0 + %v1 = insertelement <4 x i32> %v0, i32 %a0.hi, i64 1 + %v2 = insertelement <4 x i32> %v1, i32 %a1.lo, i64 2 + %v3 = insertelement <4 x i32> %v2, i32 %a1.hi, i64 3 + ret <4 x i32> %v3 +} + +define <8 x i16> @test_buildvector_v4i32_split_v8i16(i32 %a0, i32 %a1, i32 %a2, i32 %a3) nounwind { +; SSE2-32-LABEL: test_buildvector_v4i32_split_v8i16: +; SSE2-32: # %bb.0: +; SSE2-32-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero +; SSE2-32-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; SSE2-32-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE2-32-NEXT: retl +; +; SSE2-64-LABEL: test_buildvector_v4i32_split_v8i16: +; SSE2-64: # %bb.0: +; SSE2-64-NEXT: movd %ecx, %xmm0 +; SSE2-64-NEXT: movd %edx, %xmm1 +; SSE2-64-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE2-64-NEXT: movd %esi, 
%xmm2 +; SSE2-64-NEXT: movd %edi, %xmm0 +; SSE2-64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE2-64-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE2-64-NEXT: retq +; +; SSE41-32-LABEL: test_buildvector_v4i32_split_v8i16: +; SSE41-32: # %bb.0: +; SSE41-32-NEXT: pushl %esi +; SSE41-32-NEXT: movl {{[0-9]+}}(%esp), %eax +; SSE41-32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; SSE41-32-NEXT: movl {{[0-9]+}}(%esp), %edx +; SSE41-32-NEXT: movl {{[0-9]+}}(%esp), %esi +; SSE41-32-NEXT: movd %esi, %xmm0 +; SSE41-32-NEXT: shrl $16, %esi +; SSE41-32-NEXT: pinsrw $1, %esi, %xmm0 +; SSE41-32-NEXT: pinsrw $2, %edx, %xmm0 +; SSE41-32-NEXT: shrl $16, %edx +; SSE41-32-NEXT: pinsrw $3, %edx, %xmm0 +; SSE41-32-NEXT: pinsrw $4, %ecx, %xmm0 +; SSE41-32-NEXT: shrl $16, %ecx +; SSE41-32-NEXT: pinsrw $5, %ecx, %xmm0 +; SSE41-32-NEXT: pinsrw $6, %eax, %xmm0 +; SSE41-32-NEXT: shrl $16, %eax +; SSE41-32-NEXT: pinsrw $7, %eax, %xmm0 +; SSE41-32-NEXT: popl %esi +; SSE41-32-NEXT: retl +; +; SSE41-64-LABEL: test_buildvector_v4i32_split_v8i16: +; SSE41-64: # %bb.0: +; SSE41-64-NEXT: movd %edi, %xmm0 +; SSE41-64-NEXT: shrl $16, %edi +; SSE41-64-NEXT: pinsrw $1, %edi, %xmm0 +; SSE41-64-NEXT: pinsrw $2, %esi, %xmm0 +; SSE41-64-NEXT: shrl $16, %esi +; SSE41-64-NEXT: pinsrw $3, %esi, %xmm0 +; SSE41-64-NEXT: pinsrw $4, %edx, %xmm0 +; SSE41-64-NEXT: shrl $16, %edx +; SSE41-64-NEXT: pinsrw $5, %edx, %xmm0 +; SSE41-64-NEXT: pinsrw $6, %ecx, %xmm0 +; SSE41-64-NEXT: shrl $16, %ecx +; SSE41-64-NEXT: pinsrw $7, %ecx, %xmm0 +; SSE41-64-NEXT: retq +; +; AVX-32-LABEL: test_buildvector_v4i32_split_v8i16: +; AVX-32: # %bb.0: +; AVX-32-NEXT: pushl %esi +; AVX-32-NEXT: movl {{[0-9]+}}(%esp), %eax +; AVX-32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; AVX-32-NEXT: movl {{[0-9]+}}(%esp), %edx +; AVX-32-NEXT: movl {{[0-9]+}}(%esp), %esi +; AVX-32-NEXT: vmovd %esi, %xmm0 +; AVX-32-NEXT: shrl $16, %esi +; AVX-32-NEXT: vpinsrw $1, %esi, %xmm0, %xmm0 +; AVX-32-NEXT: vpinsrw $2, %edx, %xmm0, %xmm0 +; AVX-32-NEXT: shrl $16, %edx +; AVX-32-NEXT: vpinsrw $3, %edx, %xmm0, %xmm0 +; AVX-32-NEXT: vpinsrw $4, %ecx, %xmm0, %xmm0 +; AVX-32-NEXT: shrl $16, %ecx +; AVX-32-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0 +; AVX-32-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 +; AVX-32-NEXT: shrl $16, %eax +; AVX-32-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 +; AVX-32-NEXT: popl %esi +; AVX-32-NEXT: retl +; +; AVX-64-LABEL: test_buildvector_v4i32_split_v8i16: +; AVX-64: # %bb.0: +; AVX-64-NEXT: vmovd %edi, %xmm0 +; AVX-64-NEXT: shrl $16, %edi +; AVX-64-NEXT: vpinsrw $1, %edi, %xmm0, %xmm0 +; AVX-64-NEXT: vpinsrw $2, %esi, %xmm0, %xmm0 +; AVX-64-NEXT: shrl $16, %esi +; AVX-64-NEXT: vpinsrw $3, %esi, %xmm0, %xmm0 +; AVX-64-NEXT: vpinsrw $4, %edx, %xmm0, %xmm0 +; AVX-64-NEXT: shrl $16, %edx +; AVX-64-NEXT: vpinsrw $5, %edx, %xmm0, %xmm0 +; AVX-64-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0 +; AVX-64-NEXT: shrl $16, %ecx +; AVX-64-NEXT: vpinsrw $7, %ecx, %xmm0, %xmm0 +; AVX-64-NEXT: retq + %a0.lo = trunc i32 %a0 to i16 + %a1.lo = trunc i32 %a1 to i16 + %a2.lo = trunc i32 %a2 to i16 + %a3.lo = trunc i32 %a3 to i16 + %a0.shr = lshr i32 %a0, 16 + %a1.shr = lshr i32 %a1, 16 + %a2.shr = lshr i32 %a2, 16 + %a3.shr = lshr i32 %a3, 16 + %a0.hi = trunc i32 %a0.shr to i16 + %a1.hi = trunc i32 %a1.shr to i16 + %a2.hi = trunc i32 %a2.shr to i16 + %a3.hi = trunc i32 %a3.shr to i16 + %v0 = insertelement <8 x i16> poison, i16 %a0.lo, i64 0 + %v1 = insertelement <8 x i16> %v0, i16 %a0.hi, i64 1 + %v2 = insertelement <8 x i16> %v1, i16 %a1.lo, i64 2 + %v3 = insertelement <8 x i16> %v2, i16 %a1.hi, 
i64 3 + %v4 = insertelement <8 x i16> %v3, i16 %a2.lo, i64 4 + %v5 = insertelement <8 x i16> %v4, i16 %a2.hi, i64 5 + %v6 = insertelement <8 x i16> %v5, i16 %a3.lo, i64 6 + %v7 = insertelement <8 x i16> %v6, i16 %a3.hi, i64 7 + ret <8 x i16> %v7 +} + +define <16 x i8> @test_buildvector_v8i16_split_v16i8(i16 %a0, i16 %a1, i16 %a2, i16 %a3, i16 %a4, i16 %a5, i16 %a6, i16 %a7) nounwind { +; SSE2-32-LABEL: test_buildvector_v8i16_split_v16i8: +; SSE2-32: # %bb.0: +; SSE2-32-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; SSE2-32-NEXT: movd %eax, %xmm1 +; SSE2-32-NEXT: movdqa %xmm1, %xmm0 +; SSE2-32-NEXT: psrld $8, %xmm0 +; SSE2-32-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE2-32-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; SSE2-32-NEXT: movd %eax, %xmm0 +; SSE2-32-NEXT: movdqa %xmm0, %xmm2 +; SSE2-32-NEXT: psrld $8, %xmm2 +; SSE2-32-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; SSE2-32-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE2-32-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; SSE2-32-NEXT: movd %eax, %xmm2 +; SSE2-32-NEXT: movdqa %xmm2, %xmm1 +; SSE2-32-NEXT: psrld $8, %xmm1 +; SSE2-32-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; SSE2-32-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; SSE2-32-NEXT: movd %eax, %xmm1 +; SSE2-32-NEXT: movdqa %xmm1, %xmm3 +; SSE2-32-NEXT: psrld $8, %xmm3 +; SSE2-32-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] +; SSE2-32-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSE2-32-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE2-32-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; SSE2-32-NEXT: movd %eax, %xmm0 +; SSE2-32-NEXT: movdqa %xmm0, %xmm2 +; SSE2-32-NEXT: psrld $8, %xmm2 +; SSE2-32-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; SSE2-32-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; SSE2-32-NEXT: movd %eax, %xmm2 +; SSE2-32-NEXT: movdqa %xmm2, %xmm3 +; SSE2-32-NEXT: psrld $8, %xmm3 +; SSE2-32-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] +; SSE2-32-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSE2-32-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; SSE2-32-NEXT: movd %eax, %xmm3 +; SSE2-32-NEXT: movdqa %xmm3, %xmm0 +; SSE2-32-NEXT: psrld $8, %xmm0 +; SSE2-32-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; SSE2-32-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; SSE2-32-NEXT: movd %eax, %xmm0 +; SSE2-32-NEXT: movdqa %xmm0, %xmm4 +; SSE2-32-NEXT: psrld $8, %xmm4 +; SSE2-32-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] +; SSE2-32-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; 
SSE2-32-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE2-32-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE2-32-NEXT: retl +; +; SSE2-64-LABEL: test_buildvector_v8i16_split_v16i8: +; SSE2-64: # %bb.0: +; SSE2-64-NEXT: pushq %rbp +; SSE2-64-NEXT: pushq %r15 +; SSE2-64-NEXT: pushq %r14 +; SSE2-64-NEXT: pushq %rbx +; SSE2-64-NEXT: movzwl %di, %eax +; SSE2-64-NEXT: movzwl %si, %r10d +; SSE2-64-NEXT: movzwl %dx, %r11d +; SSE2-64-NEXT: movzwl %cx, %ebx +; SSE2-64-NEXT: movzwl %r8w, %ebp +; SSE2-64-NEXT: movzwl %r9w, %r14d +; SSE2-64-NEXT: movzwl {{[0-9]+}}(%rsp), %r15d +; SSE2-64-NEXT: movd %r15d, %xmm0 +; SSE2-64-NEXT: movdqa %xmm0, %xmm1 +; SSE2-64-NEXT: psrld $8, %xmm1 +; SSE2-64-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE2-64-NEXT: movzwl {{[0-9]+}}(%rsp), %r15d +; SSE2-64-NEXT: movd %r15d, %xmm2 +; SSE2-64-NEXT: movdqa %xmm2, %xmm1 +; SSE2-64-NEXT: psrld $8, %xmm1 +; SSE2-64-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; SSE2-64-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSE2-64-NEXT: movd %r9d, %xmm0 +; SSE2-64-NEXT: movd %r14d, %xmm1 +; SSE2-64-NEXT: psrld $8, %xmm1 +; SSE2-64-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE2-64-NEXT: movd %r8d, %xmm1 +; SSE2-64-NEXT: movd %ebp, %xmm3 +; SSE2-64-NEXT: psrld $8, %xmm3 +; SSE2-64-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] +; SSE2-64-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE2-64-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE2-64-NEXT: movd %ecx, %xmm0 +; SSE2-64-NEXT: movd %ebx, %xmm2 +; SSE2-64-NEXT: psrld $8, %xmm2 +; SSE2-64-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; SSE2-64-NEXT: movd %edx, %xmm2 +; SSE2-64-NEXT: movd %r11d, %xmm3 +; SSE2-64-NEXT: psrld $8, %xmm3 +; SSE2-64-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] +; SSE2-64-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSE2-64-NEXT: movd %esi, %xmm3 +; SSE2-64-NEXT: movd %r10d, %xmm0 +; SSE2-64-NEXT: psrld $8, %xmm0 +; SSE2-64-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; SSE2-64-NEXT: movd %edi, %xmm0 +; SSE2-64-NEXT: movd %eax, %xmm4 +; SSE2-64-NEXT: psrld $8, %xmm4 +; SSE2-64-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] +; SSE2-64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSE2-64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE2-64-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE2-64-NEXT: popq %rbx +; SSE2-64-NEXT: popq %r14 +; SSE2-64-NEXT: popq %r15 
+; SSE2-64-NEXT: popq %rbp +; SSE2-64-NEXT: retq +; +; SSE41-32-LABEL: test_buildvector_v8i16_split_v16i8: +; SSE41-32: # %bb.0: +; SSE41-32-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; SSE41-32-NEXT: movd %eax, %xmm0 +; SSE41-32-NEXT: shrl $8, %eax +; SSE41-32-NEXT: pinsrb $1, %eax, %xmm0 +; SSE41-32-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; SSE41-32-NEXT: pinsrb $2, %eax, %xmm0 +; SSE41-32-NEXT: shrl $8, %eax +; SSE41-32-NEXT: pinsrb $3, %eax, %xmm0 +; SSE41-32-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; SSE41-32-NEXT: pinsrb $4, %eax, %xmm0 +; SSE41-32-NEXT: shrl $8, %eax +; SSE41-32-NEXT: pinsrb $5, %eax, %xmm0 +; SSE41-32-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; SSE41-32-NEXT: pinsrb $6, %eax, %xmm0 +; SSE41-32-NEXT: shrl $8, %eax +; SSE41-32-NEXT: pinsrb $7, %eax, %xmm0 +; SSE41-32-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; SSE41-32-NEXT: pinsrb $8, %eax, %xmm0 +; SSE41-32-NEXT: shrl $8, %eax +; SSE41-32-NEXT: pinsrb $9, %eax, %xmm0 +; SSE41-32-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; SSE41-32-NEXT: pinsrb $10, %eax, %xmm0 +; SSE41-32-NEXT: shrl $8, %eax +; SSE41-32-NEXT: pinsrb $11, %eax, %xmm0 +; SSE41-32-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; SSE41-32-NEXT: pinsrb $12, %eax, %xmm0 +; SSE41-32-NEXT: shrl $8, %eax +; SSE41-32-NEXT: pinsrb $13, %eax, %xmm0 +; SSE41-32-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; SSE41-32-NEXT: pinsrb $14, %eax, %xmm0 +; SSE41-32-NEXT: shrl $8, %eax +; SSE41-32-NEXT: pinsrb $15, %eax, %xmm0 +; SSE41-32-NEXT: retl +; +; SSE41-64-LABEL: test_buildvector_v8i16_split_v16i8: +; SSE41-64: # %bb.0: +; SSE41-64-NEXT: movd %edi, %xmm0 +; SSE41-64-NEXT: shrl $8, %edi +; SSE41-64-NEXT: pinsrb $1, %edi, %xmm0 +; SSE41-64-NEXT: pinsrb $2, %esi, %xmm0 +; SSE41-64-NEXT: shrl $8, %esi +; SSE41-64-NEXT: pinsrb $3, %esi, %xmm0 +; SSE41-64-NEXT: pinsrb $4, %edx, %xmm0 +; SSE41-64-NEXT: shrl $8, %edx +; SSE41-64-NEXT: pinsrb $5, %edx, %xmm0 +; SSE41-64-NEXT: pinsrb $6, %ecx, %xmm0 +; SSE41-64-NEXT: shrl $8, %ecx +; SSE41-64-NEXT: pinsrb $7, %ecx, %xmm0 +; SSE41-64-NEXT: pinsrb $8, %r8d, %xmm0 +; SSE41-64-NEXT: shrl $8, %r8d +; SSE41-64-NEXT: pinsrb $9, %r8d, %xmm0 +; SSE41-64-NEXT: pinsrb $10, %r9d, %xmm0 +; SSE41-64-NEXT: shrl $8, %r9d +; SSE41-64-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; SSE41-64-NEXT: pinsrb $11, %r9d, %xmm0 +; SSE41-64-NEXT: pinsrb $12, %eax, %xmm0 +; SSE41-64-NEXT: shrl $8, %eax +; SSE41-64-NEXT: movzwl {{[0-9]+}}(%rsp), %ecx +; SSE41-64-NEXT: pinsrb $13, %eax, %xmm0 +; SSE41-64-NEXT: pinsrb $14, %ecx, %xmm0 +; SSE41-64-NEXT: shrl $8, %ecx +; SSE41-64-NEXT: pinsrb $15, %ecx, %xmm0 +; SSE41-64-NEXT: retq +; +; AVX-32-LABEL: test_buildvector_v8i16_split_v16i8: +; AVX-32: # %bb.0: +; AVX-32-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; AVX-32-NEXT: vmovd %eax, %xmm0 +; AVX-32-NEXT: shrl $8, %eax +; AVX-32-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 +; AVX-32-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; AVX-32-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 +; AVX-32-NEXT: shrl $8, %eax +; AVX-32-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 +; AVX-32-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; AVX-32-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 +; AVX-32-NEXT: shrl $8, %eax +; AVX-32-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; AVX-32-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; AVX-32-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 +; AVX-32-NEXT: shrl $8, %eax +; AVX-32-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; AVX-32-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; AVX-32-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 +; AVX-32-NEXT: shrl $8, %eax +; AVX-32-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 +; AVX-32-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; AVX-32-NEXT: vpinsrb $10, %eax, 
%xmm0, %xmm0 +; AVX-32-NEXT: shrl $8, %eax +; AVX-32-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; AVX-32-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; AVX-32-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 +; AVX-32-NEXT: shrl $8, %eax +; AVX-32-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; AVX-32-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; AVX-32-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 +; AVX-32-NEXT: shrl $8, %eax +; AVX-32-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; AVX-32-NEXT: retl +; +; AVX-64-LABEL: test_buildvector_v8i16_split_v16i8: +; AVX-64: # %bb.0: +; AVX-64-NEXT: vmovd %edi, %xmm0 +; AVX-64-NEXT: shrl $8, %edi +; AVX-64-NEXT: vpinsrb $1, %edi, %xmm0, %xmm0 +; AVX-64-NEXT: vpinsrb $2, %esi, %xmm0, %xmm0 +; AVX-64-NEXT: shrl $8, %esi +; AVX-64-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; AVX-64-NEXT: vpinsrb $4, %edx, %xmm0, %xmm0 +; AVX-64-NEXT: shrl $8, %edx +; AVX-64-NEXT: vpinsrb $5, %edx, %xmm0, %xmm0 +; AVX-64-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; AVX-64-NEXT: shrl $8, %ecx +; AVX-64-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0 +; AVX-64-NEXT: vpinsrb $8, %r8d, %xmm0, %xmm0 +; AVX-64-NEXT: shrl $8, %r8d +; AVX-64-NEXT: vpinsrb $9, %r8d, %xmm0, %xmm0 +; AVX-64-NEXT: vpinsrb $10, %r9d, %xmm0, %xmm0 +; AVX-64-NEXT: shrl $8, %r9d +; AVX-64-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; AVX-64-NEXT: vpinsrb $11, %r9d, %xmm0, %xmm0 +; AVX-64-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 +; AVX-64-NEXT: shrl $8, %eax +; AVX-64-NEXT: movzwl {{[0-9]+}}(%rsp), %ecx +; AVX-64-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; AVX-64-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; AVX-64-NEXT: shrl $8, %ecx +; AVX-64-NEXT: vpinsrb $15, %ecx, %xmm0, %xmm0 +; AVX-64-NEXT: retq + %a0.lo = trunc i16 %a0 to i8 + %a1.lo = trunc i16 %a1 to i8 + %a2.lo = trunc i16 %a2 to i8 + %a3.lo = trunc i16 %a3 to i8 + %a4.lo = trunc i16 %a4 to i8 + %a5.lo = trunc i16 %a5 to i8 + %a6.lo = trunc i16 %a6 to i8 + %a7.lo = trunc i16 %a7 to i8 + %a0.shr = lshr i16 %a0, 8 + %a1.shr = lshr i16 %a1, 8 + %a2.shr = lshr i16 %a2, 8 + %a3.shr = lshr i16 %a3, 8 + %a4.shr = lshr i16 %a4, 8 + %a5.shr = lshr i16 %a5, 8 + %a6.shr = lshr i16 %a6, 8 + %a7.shr = lshr i16 %a7, 8 + %a0.hi = trunc i16 %a0.shr to i8 + %a1.hi = trunc i16 %a1.shr to i8 + %a2.hi = trunc i16 %a2.shr to i8 + %a3.hi = trunc i16 %a3.shr to i8 + %a4.hi = trunc i16 %a4.shr to i8 + %a5.hi = trunc i16 %a5.shr to i8 + %a6.hi = trunc i16 %a6.shr to i8 + %a7.hi = trunc i16 %a7.shr to i8 + %v0 = insertelement <16 x i8> poison, i8 %a0.lo, i64 0 + %v1 = insertelement <16 x i8> %v0, i8 %a0.hi, i64 1 + %v2 = insertelement <16 x i8> %v1, i8 %a1.lo, i64 2 + %v3 = insertelement <16 x i8> %v2, i8 %a1.hi, i64 3 + %v4 = insertelement <16 x i8> %v3, i8 %a2.lo, i64 4 + %v5 = insertelement <16 x i8> %v4, i8 %a2.hi, i64 5 + %v6 = insertelement <16 x i8> %v5, i8 %a3.lo, i64 6 + %v7 = insertelement <16 x i8> %v6, i8 %a3.hi, i64 7 + %v8 = insertelement <16 x i8> %v7, i8 %a4.lo, i64 8 + %v9 = insertelement <16 x i8> %v8, i8 %a4.hi, i64 9 + %v10 = insertelement <16 x i8> %v9, i8 %a5.lo, i64 10 + %v11 = insertelement <16 x i8> %v10, i8 %a5.hi, i64 11 + %v12 = insertelement <16 x i8> %v11, i8 %a6.lo, i64 12 + %v13 = insertelement <16 x i8> %v12, i8 %a6.hi, i64 13 + %v14 = insertelement <16 x i8> %v13, i8 %a7.lo, i64 14 + %v15 = insertelement <16 x i8> %v14, i8 %a7.hi, i64 15 + ret <16 x i8> %v15 +} + ; build vectors of repeated elements define <4 x float> @test_buildvector_4f32_2_var(float %a0, float %a1) { From 1a88d040895223a5eeae14be2b8a7e317a1cbe3a Mon Sep 17 00:00:00 2001 From: wdx727 Date: Wed, 12 Nov 2025 01:17:24 +0800 Subject: [PATCH 02/64] Adding 
Matching and Inference Functionality to Propeller-PR4: Implement matching and inference and create clusters (#165868) Adding Matching and Inference Functionality to Propeller. For detailed information, please refer to the following RFC: https://discourse.llvm.org/t/rfc-adding-matching-and-inference-functionality-to-propeller/86238. This is the fourth PR, which is used to implement matching and inference and create the clusters. The associated PRs are: PR1: https://github.com/llvm/llvm-project/pull/160706 PR2: https://github.com/llvm/llvm-project/pull/162963 PR3: https://github.com/llvm/llvm-project/pull/164223 co-authors: lifengxiang1025 [lifengxiang@kuaishou.com](mailto:lifengxiang@kuaishou.com); zcfh [wuminghui03@kuaishou.com](mailto:wuminghui03@kuaishou.com) Co-authored-by: lifengxiang1025 Co-authored-by: zcfh --- .../CodeGen/BasicBlockMatchingAndInference.h | 62 ++++++ .../CodeGen/BasicBlockSectionsProfileReader.h | 7 + .../llvm/CodeGen/MachineBlockHashInfo.h | 2 + llvm/include/llvm/CodeGen/Passes.h | 4 + llvm/include/llvm/InitializePasses.h | 1 + .../Transforms/Utils/SampleProfileInference.h | 16 ++ .../BasicBlockMatchingAndInference.cpp | 195 ++++++++++++++++++ llvm/lib/CodeGen/BasicBlockSections.cpp | 85 +++++++- .../BasicBlockSectionsProfileReader.cpp | 15 ++ llvm/lib/CodeGen/CMakeLists.txt | 1 + llvm/lib/CodeGen/TargetPassConfig.cpp | 13 +- .../Utils/SampleProfileInference.cpp | 2 - .../basic-block-sections-clusters-bb-hash.ll | 99 +++++++++ 13 files changed, 496 insertions(+), 6 deletions(-) create mode 100644 llvm/include/llvm/CodeGen/BasicBlockMatchingAndInference.h create mode 100644 llvm/lib/CodeGen/BasicBlockMatchingAndInference.cpp create mode 100644 llvm/test/CodeGen/X86/basic-block-sections-clusters-bb-hash.ll diff --git a/llvm/include/llvm/CodeGen/BasicBlockMatchingAndInference.h b/llvm/include/llvm/CodeGen/BasicBlockMatchingAndInference.h new file mode 100644 index 0000000000000..6e9bbb969a445 --- /dev/null +++ b/llvm/include/llvm/CodeGen/BasicBlockMatchingAndInference.h @@ -0,0 +1,62 @@ +//===- llvm/CodeGen/BasicBlockMatchingAndInference.h ------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Infer weights for all basic blocks using matching and inference. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CODEGEN_BASIC_BLOCK_AND_INFERENCE_H +#define LLVM_CODEGEN_BASIC_BLOCK_AND_INFERENCE_H + +#include "llvm/CodeGen/BasicBlockSectionsProfileReader.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/Transforms/Utils/SampleProfileInference.h" + +namespace llvm { + +class BasicBlockMatchingAndInference : public MachineFunctionPass { +private: + using Edge = std::pair; + using BlockWeightMap = DenseMap; + using EdgeWeightMap = DenseMap; + using BlockEdgeMap = DenseMap>; + + struct WeightInfo { + // Weight of basic blocks. + BlockWeightMap BlockWeights; + // Weight of edges. 
+ EdgeWeightMap EdgeWeights; + }; + +public: + static char ID; + BasicBlockMatchingAndInference(); + + StringRef getPassName() const override { + return "Basic Block Matching and Inference"; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override; + + bool runOnMachineFunction(MachineFunction &F) override; + + std::optional getWeightInfo(StringRef FuncName) const; + +private: + StringMap ProgramWeightInfo; + + WeightInfo initWeightInfoByMatching(MachineFunction &MF); + + void generateWeightInfoByInference(MachineFunction &MF, + WeightInfo &MatchWeight); +}; + +} // end namespace llvm + +#endif // LLVM_CODEGEN_BASIC_BLOCK_AND_INFERENCE_H diff --git a/llvm/include/llvm/CodeGen/BasicBlockSectionsProfileReader.h b/llvm/include/llvm/CodeGen/BasicBlockSectionsProfileReader.h index ee1f28377f7e4..f0d28d863282e 100644 --- a/llvm/include/llvm/CodeGen/BasicBlockSectionsProfileReader.h +++ b/llvm/include/llvm/CodeGen/BasicBlockSectionsProfileReader.h @@ -86,6 +86,10 @@ class BasicBlockSectionsProfileReader { uint64_t getEdgeCount(StringRef FuncName, const UniqueBBID &SrcBBID, const UniqueBBID &SinkBBID) const; + // Return the complete function path and cluster info for the given function. + std::pair + getFunctionPathAndClusterInfo(StringRef FuncName) const; + private: StringRef getAliasName(StringRef FuncName) const { auto R = FuncAliasMap.find(FuncName); @@ -195,6 +199,9 @@ class BasicBlockSectionsProfileReaderWrapperPass : public ImmutablePass { uint64_t getEdgeCount(StringRef FuncName, const UniqueBBID &SrcBBID, const UniqueBBID &DestBBID) const; + std::pair + getFunctionPathAndClusterInfo(StringRef FuncName) const; + // Initializes the FunctionNameToDIFilename map for the current module and // then reads the profile for the matching functions. bool doInitialization(Module &M) override; diff --git a/llvm/include/llvm/CodeGen/MachineBlockHashInfo.h b/llvm/include/llvm/CodeGen/MachineBlockHashInfo.h index d044d5f940b75..6f26819d566ae 100644 --- a/llvm/include/llvm/CodeGen/MachineBlockHashInfo.h +++ b/llvm/include/llvm/CodeGen/MachineBlockHashInfo.h @@ -80,6 +80,8 @@ struct BlendedBlockHash { return Dist; } + uint16_t getOpcodeHash() const { return OpcodeHash; } + private: /// The offset of the basic block from the function start. uint16_t Offset{0}; diff --git a/llvm/include/llvm/CodeGen/Passes.h b/llvm/include/llvm/CodeGen/Passes.h index a8525554b142e..2bf83cfa655b6 100644 --- a/llvm/include/llvm/CodeGen/Passes.h +++ b/llvm/include/llvm/CodeGen/Passes.h @@ -69,6 +69,10 @@ LLVM_ABI MachineFunctionPass *createBasicBlockSectionsPass(); LLVM_ABI MachineFunctionPass *createBasicBlockPathCloningPass(); +/// createBasicBlockMatchingAndInferencePass - This pass enables matching +/// and inference when using propeller. +LLVM_ABI MachineFunctionPass *createBasicBlockMatchingAndInferencePass(); + /// createMachineBlockHashInfoPass - This pass computes basic block hashes. 
LLVM_ABI MachineFunctionPass *createMachineBlockHashInfoPass(); diff --git a/llvm/include/llvm/InitializePasses.h b/llvm/include/llvm/InitializePasses.h index 10a4d8525a9e8..18732caf78966 100644 --- a/llvm/include/llvm/InitializePasses.h +++ b/llvm/include/llvm/InitializePasses.h @@ -55,6 +55,7 @@ LLVM_ABI void initializeAlwaysInlinerLegacyPassPass(PassRegistry &); LLVM_ABI void initializeAssignmentTrackingAnalysisPass(PassRegistry &); LLVM_ABI void initializeAssumptionCacheTrackerPass(PassRegistry &); LLVM_ABI void initializeAtomicExpandLegacyPass(PassRegistry &); +LLVM_ABI void initializeBasicBlockMatchingAndInferencePass(PassRegistry &); LLVM_ABI void initializeBasicBlockPathCloningPass(PassRegistry &); LLVM_ABI void initializeBasicBlockSectionsProfileReaderWrapperPassPass(PassRegistry &); diff --git a/llvm/include/llvm/Transforms/Utils/SampleProfileInference.h b/llvm/include/llvm/Transforms/Utils/SampleProfileInference.h index 7231e45fe8eb7..e1663d29c1e3c 100644 --- a/llvm/include/llvm/Transforms/Utils/SampleProfileInference.h +++ b/llvm/include/llvm/Transforms/Utils/SampleProfileInference.h @@ -130,6 +130,11 @@ template class SampleProfileInference { SampleProfileInference(FunctionT &F, BlockEdgeMap &Successors, BlockWeightMap &SampleBlockWeights) : F(F), Successors(Successors), SampleBlockWeights(SampleBlockWeights) {} + SampleProfileInference(FunctionT &F, BlockEdgeMap &Successors, + BlockWeightMap &SampleBlockWeights, + EdgeWeightMap &SampleEdgeWeights) + : F(F), Successors(Successors), SampleBlockWeights(SampleBlockWeights), + SampleEdgeWeights(SampleEdgeWeights) {} /// Apply the profile inference algorithm for a given function void apply(BlockWeightMap &BlockWeights, EdgeWeightMap &EdgeWeights); @@ -157,6 +162,9 @@ template class SampleProfileInference { /// Map basic blocks to their sampled weights. BlockWeightMap &SampleBlockWeights; + + /// Map edges to their sampled weights. + EdgeWeightMap SampleEdgeWeights; }; template @@ -266,6 +274,14 @@ FlowFunction SampleProfileInference::createFlowFunction( FlowJump Jump; Jump.Source = BlockIndex[BB]; Jump.Target = BlockIndex[Succ]; + auto It = SampleEdgeWeights.find(std::make_pair(BB, Succ)); + if (It != SampleEdgeWeights.end()) { + Jump.HasUnknownWeight = false; + Jump.Weight = It->second; + } else { + Jump.HasUnknownWeight = true; + Jump.Weight = 0; + } Func.Jumps.push_back(Jump); } } diff --git a/llvm/lib/CodeGen/BasicBlockMatchingAndInference.cpp b/llvm/lib/CodeGen/BasicBlockMatchingAndInference.cpp new file mode 100644 index 0000000000000..4fa90799f4e10 --- /dev/null +++ b/llvm/lib/CodeGen/BasicBlockMatchingAndInference.cpp @@ -0,0 +1,195 @@ +//===- llvm/CodeGen/BasicBlockMatchingAndInference.cpp ----------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// In Propeller's profile, we have already read the hash values of basic blocks, +// as well as the weights of basic blocks and edges in the CFG. In this file, +// we first match the basic blocks in the profile with those in the current +// MachineFunction using the basic block hash, thereby obtaining the weights of +// some basic blocks and edges. Subsequently, we infer the weights of all basic +// blocks using an inference algorithm. 
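+//
+// Concretely (a summary of the code below, not additional behavior): each
+// MachineBasicBlock's BlendedBlockHash is bucketed by its 16-bit OpcodeHash,
+// and a profile block is matched to the candidate in its bucket with the
+// smallest hash distance. The counts of matched blocks and matched
+// (pred, succ) edges then seed SampleProfileInference, which infers the
+// remaining block and edge weights.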
+// +// TODO: Integrate part of the code in this file with BOLT's implementation into +// the LLVM infrastructure, enabling both BOLT and Propeller to reuse it. +// +//===----------------------------------------------------------------------===// + +#include "llvm/CodeGen/BasicBlockMatchingAndInference.h" +#include "llvm/CodeGen/BasicBlockSectionsProfileReader.h" +#include "llvm/CodeGen/MachineBlockHashInfo.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/InitializePasses.h" +#include + +using namespace llvm; + +static cl::opt + PropellerInferThreshold("propeller-infer-threshold", + cl::desc("Threshold for infer stale profile"), + cl::init(0.6), cl::Optional); + +/// The object is used to identify and match basic blocks given their hashes. +class StaleMatcher { +public: + /// Initialize stale matcher. + void init(const std::vector &Blocks, + const std::vector &Hashes) { + assert(Blocks.size() == Hashes.size() && + "incorrect matcher initialization"); + for (size_t I = 0; I < Blocks.size(); I++) { + MachineBasicBlock *Block = Blocks[I]; + uint16_t OpHash = Hashes[I].getOpcodeHash(); + OpHashToBlocks[OpHash].push_back(std::make_pair(Hashes[I], Block)); + } + } + + /// Find the most similar block for a given hash. + MachineBasicBlock *matchBlock(BlendedBlockHash BlendedHash) const { + auto BlockIt = OpHashToBlocks.find(BlendedHash.getOpcodeHash()); + if (BlockIt == OpHashToBlocks.end()) { + return nullptr; + } + MachineBasicBlock *BestBlock = nullptr; + uint64_t BestDist = std::numeric_limits::max(); + for (auto It : BlockIt->second) { + MachineBasicBlock *Block = It.second; + BlendedBlockHash Hash = It.first; + uint64_t Dist = Hash.distance(BlendedHash); + if (BestBlock == nullptr || Dist < BestDist) { + BestDist = Dist; + BestBlock = Block; + } + } + return BestBlock; + } + +private: + using HashBlockPairType = std::pair; + std::unordered_map> OpHashToBlocks; +}; + +INITIALIZE_PASS_BEGIN(BasicBlockMatchingAndInference, + "machine-block-match-infer", + "Machine Block Matching and Inference Analysis", true, + true) +INITIALIZE_PASS_DEPENDENCY(MachineBlockHashInfo) +INITIALIZE_PASS_DEPENDENCY(BasicBlockSectionsProfileReaderWrapperPass) +INITIALIZE_PASS_END(BasicBlockMatchingAndInference, "machine-block-match-infer", + "Machine Block Matching and Inference Analysis", true, true) + +char BasicBlockMatchingAndInference::ID = 0; + +BasicBlockMatchingAndInference::BasicBlockMatchingAndInference() + : MachineFunctionPass(ID) { + initializeBasicBlockMatchingAndInferencePass( + *PassRegistry::getPassRegistry()); +} + +void BasicBlockMatchingAndInference::getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired(); + AU.addRequired(); + AU.setPreservesAll(); + MachineFunctionPass::getAnalysisUsage(AU); +} + +std::optional +BasicBlockMatchingAndInference::getWeightInfo(StringRef FuncName) const { + auto It = ProgramWeightInfo.find(FuncName); + if (It == ProgramWeightInfo.end()) { + return std::nullopt; + } + return It->second; +} + +BasicBlockMatchingAndInference::WeightInfo +BasicBlockMatchingAndInference::initWeightInfoByMatching(MachineFunction &MF) { + std::vector Blocks; + std::vector Hashes; + auto BSPR = &getAnalysis(); + auto MBHI = &getAnalysis(); + for (auto &Block : MF) { + Blocks.push_back(&Block); + Hashes.push_back(BlendedBlockHash(MBHI->getMBBHash(Block))); + } + StaleMatcher Matcher; + Matcher.init(Blocks, Hashes); + BasicBlockMatchingAndInference::WeightInfo MatchWeight; + auto [IsValid, PathAndClusterInfo] = + BSPR->getFunctionPathAndClusterInfo(MF.getName()); + if (!IsValid) + 
return MatchWeight; + for (auto &BlockCount : PathAndClusterInfo.NodeCounts) { + if (PathAndClusterInfo.BBHashes.count(BlockCount.first.BaseID)) { + auto Hash = PathAndClusterInfo.BBHashes[BlockCount.first.BaseID]; + MachineBasicBlock *Block = Matcher.matchBlock(BlendedBlockHash(Hash)); + // When a basic block has clone copies, sum their counts. + if (Block != nullptr) + MatchWeight.BlockWeights[Block] += BlockCount.second; + } + } + for (auto &PredItem : PathAndClusterInfo.EdgeCounts) { + auto PredID = PredItem.first.BaseID; + if (!PathAndClusterInfo.BBHashes.count(PredID)) + continue; + auto PredHash = PathAndClusterInfo.BBHashes[PredID]; + MachineBasicBlock *PredBlock = + Matcher.matchBlock(BlendedBlockHash(PredHash)); + if (PredBlock == nullptr) + continue; + for (auto &SuccItem : PredItem.second) { + auto SuccID = SuccItem.first.BaseID; + auto EdgeWeight = SuccItem.second; + if (PathAndClusterInfo.BBHashes.count(SuccID)) { + auto SuccHash = PathAndClusterInfo.BBHashes[SuccID]; + MachineBasicBlock *SuccBlock = + Matcher.matchBlock(BlendedBlockHash(SuccHash)); + // When an edge has clone copies, sum their counts. + if (SuccBlock != nullptr) + MatchWeight.EdgeWeights[std::make_pair(PredBlock, SuccBlock)] += + EdgeWeight; + } + } + } + return MatchWeight; +} + +void BasicBlockMatchingAndInference::generateWeightInfoByInference( + MachineFunction &MF, + BasicBlockMatchingAndInference::WeightInfo &MatchWeight) { + BlockEdgeMap Successors; + for (auto &Block : MF) { + for (auto *Succ : Block.successors()) + Successors[&Block].push_back(Succ); + } + SampleProfileInference SPI( + MF, Successors, MatchWeight.BlockWeights, MatchWeight.EdgeWeights); + BlockWeightMap BlockWeights; + EdgeWeightMap EdgeWeights; + SPI.apply(BlockWeights, EdgeWeights); + ProgramWeightInfo.try_emplace( + MF.getName(), BasicBlockMatchingAndInference::WeightInfo{ + std::move(BlockWeights), std::move(EdgeWeights)}); +} + +bool BasicBlockMatchingAndInference::runOnMachineFunction(MachineFunction &MF) { + if (MF.empty()) + return false; + auto MatchWeight = initWeightInfoByMatching(MF); + // If the ratio of the number of MBBs in matching to the total number of MBBs + // in the function is less than the threshold value, the processing should be + // abandoned. + if (static_cast(MatchWeight.BlockWeights.size()) / MF.size() < + PropellerInferThreshold) { + return false; + } + generateWeightInfoByInference(MF, MatchWeight); + return false; +} + +MachineFunctionPass *llvm::createBasicBlockMatchingAndInferencePass() { + return new BasicBlockMatchingAndInference(); +} diff --git a/llvm/lib/CodeGen/BasicBlockSections.cpp b/llvm/lib/CodeGen/BasicBlockSections.cpp index 52e2909bec072..87cd55be23194 100644 --- a/llvm/lib/CodeGen/BasicBlockSections.cpp +++ b/llvm/lib/CodeGen/BasicBlockSections.cpp @@ -70,6 +70,7 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" +#include "llvm/CodeGen/BasicBlockMatchingAndInference.h" #include "llvm/CodeGen/BasicBlockSectionUtils.h" #include "llvm/CodeGen/BasicBlockSectionsProfileReader.h" #include "llvm/CodeGen/MachineDominators.h" @@ -81,6 +82,7 @@ #include "llvm/InitializePasses.h" #include "llvm/Support/UniqueBBID.h" #include "llvm/Target/TargetMachine.h" +#include "llvm/Transforms/Utils/CodeLayout.h" #include using namespace llvm; @@ -175,6 +177,79 @@ updateBranches(MachineFunction &MF, } } +// This function generates the machine basic block clusters of "hot" blocks. +// Currently, only support one cluster creation. +// TODO: Support multi-cluster creation and path cloning. 
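+//
+// Sketch of the flow (mirroring the implementation below): for functions with
+// more than two blocks, the block sizes (approximated as 4 bytes per
+// non-debug instruction), the inferred block counts and the inferred edge
+// counts are passed to computeExtTspLayout(); blocks that are the entry block
+// or carry a positive inferred weight are then emitted in layout order as the
+// single cluster 0.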
+static SmallVector +createBBClusterInfoForFunction(MachineFunction &MF, + BasicBlockMatchingAndInference *BMI) { + unsigned CurrentCluster = 0; + SmallVector BBClusterInfos; + auto OptWeightInfo = BMI->getWeightInfo(MF.getName()); + if (!OptWeightInfo) + return BBClusterInfos; + auto BlockWeights = OptWeightInfo->BlockWeights; + auto EdgeWeights = OptWeightInfo->EdgeWeights; + + SmallVector HotMBBs; + if (MF.size() <= 2) { + for (auto &MBB : MF) { + if (MBB.isEntryBlock() || BlockWeights[&MBB] > 0) { + HotMBBs.push_back(&MBB); + } + } + } else { + SmallVector BlockSizes(MF.size()); + SmallVector BlockCounts(MF.size()); + std::vector OrigOrder; + OrigOrder.reserve(MF.size()); + SmallVector JumpCounts; + + // Renumber blocks for running the layout algorithm. + MF.RenumberBlocks(); + + // Init the MBB size and count. + for (auto &MBB : MF) { + auto NonDbgInsts = + instructionsWithoutDebug(MBB.instr_begin(), MBB.instr_end()); + int NumInsts = std::distance(NonDbgInsts.begin(), NonDbgInsts.end()); + BlockSizes[MBB.getNumber()] = 4 * NumInsts; + BlockCounts[MBB.getNumber()] = BlockWeights[&MBB]; + OrigOrder.push_back(&MBB); + } + + // Init the edge count. + for (auto &MBB : MF) { + for (auto *Succ : MBB.successors()) { + auto EdgeWeight = EdgeWeights[std::make_pair(&MBB, Succ)]; + JumpCounts.push_back({static_cast(MBB.getNumber()), + static_cast(Succ->getNumber()), + EdgeWeight}); + } + } + + // Run the layout algorithm. + auto Result = computeExtTspLayout(BlockSizes, BlockCounts, JumpCounts); + for (uint64_t R : Result) { + auto Block = OrigOrder[R]; + if (Block->isEntryBlock() || BlockWeights[Block] > 0) + HotMBBs.push_back(Block); + } + } + + // Generate the "hot" basic block cluster. + if (!HotMBBs.empty()) { + unsigned CurrentPosition = 0; + for (auto &MBB : HotMBBs) { + if (MBB->getBBID()) { + BBClusterInfos.push_back( + {*(MBB->getBBID()), CurrentCluster, CurrentPosition++}); + } + } + } + return BBClusterInfos; +} + // This function sorts basic blocks according to the cluster's information. // All explicitly specified clusters of basic blocks will be ordered // accordingly. All non-specified BBs go into a separate "Cold" section. 
@@ -308,8 +383,13 @@ bool BasicBlockSections::handleBBSections(MachineFunction &MF) { DenseMap FuncClusterInfo; if (BBSectionsType == BasicBlockSection::List) { - auto ClusterInfo = getAnalysis() - .getClusterInfoForFunction(MF.getName()); + SmallVector ClusterInfo; + if (auto *BMI = getAnalysisIfAvailable()) { + ClusterInfo = createBBClusterInfoForFunction(MF, BMI); + } else { + ClusterInfo = getAnalysis() + .getClusterInfoForFunction(MF.getName()); + } if (ClusterInfo.empty()) return false; for (auto &BBClusterInfo : ClusterInfo) { @@ -399,6 +479,7 @@ bool BasicBlockSections::runOnMachineFunction(MachineFunction &MF) { void BasicBlockSections::getAnalysisUsage(AnalysisUsage &AU) const { AU.setPreservesAll(); AU.addRequired(); + AU.addUsedIfAvailable(); AU.addUsedIfAvailable(); AU.addUsedIfAvailable(); MachineFunctionPass::getAnalysisUsage(AU); diff --git a/llvm/lib/CodeGen/BasicBlockSectionsProfileReader.cpp b/llvm/lib/CodeGen/BasicBlockSectionsProfileReader.cpp index c234c0f1b0b34..be1c60c57ccf4 100644 --- a/llvm/lib/CodeGen/BasicBlockSectionsProfileReader.cpp +++ b/llvm/lib/CodeGen/BasicBlockSectionsProfileReader.cpp @@ -93,6 +93,15 @@ uint64_t BasicBlockSectionsProfileReader::getEdgeCount( return EdgeIt->second; } +std::pair +BasicBlockSectionsProfileReader::getFunctionPathAndClusterInfo( + StringRef FuncName) const { + auto R = ProgramPathAndClusterInfo.find(getAliasName(FuncName)); + return R != ProgramPathAndClusterInfo.end() + ? std::pair(true, R->second) + : std::pair(false, FunctionPathAndClusterInfo()); +} + // Reads the version 1 basic block sections profile. Profile for each function // is encoded as follows: // m @@ -514,6 +523,12 @@ uint64_t BasicBlockSectionsProfileReaderWrapperPass::getEdgeCount( return BBSPR.getEdgeCount(FuncName, SrcBBID, SinkBBID); } +std::pair +BasicBlockSectionsProfileReaderWrapperPass::getFunctionPathAndClusterInfo( + StringRef FuncName) const { + return BBSPR.getFunctionPathAndClusterInfo(FuncName); +} + BasicBlockSectionsProfileReader & BasicBlockSectionsProfileReaderWrapperPass::getBBSPR() { return BBSPR; diff --git a/llvm/lib/CodeGen/CMakeLists.txt b/llvm/lib/CodeGen/CMakeLists.txt index 1cf0b4964760b..30237e66ed0ec 100644 --- a/llvm/lib/CodeGen/CMakeLists.txt +++ b/llvm/lib/CodeGen/CMakeLists.txt @@ -35,6 +35,7 @@ add_llvm_component_library(LLVMCodeGen BasicBlockSections.cpp BasicBlockPathCloning.cpp BasicBlockSectionsProfileReader.cpp + BasicBlockMatchingAndInference.cpp CalcSpillWeights.cpp CallBrPrepare.cpp CallingConvLower.cpp diff --git a/llvm/lib/CodeGen/TargetPassConfig.cpp b/llvm/lib/CodeGen/TargetPassConfig.cpp index 10b723887b21f..d94cc70da0ef0 100644 --- a/llvm/lib/CodeGen/TargetPassConfig.cpp +++ b/llvm/lib/CodeGen/TargetPassConfig.cpp @@ -272,6 +272,12 @@ static cl::opt cl::desc("Split static data sections into hot and cold " "sections using profile information")); +/// Enable matching and inference when using propeller. +static cl::opt PropellerMatchInfer( + "propeller-match-infer", + cl::desc("Enable matching and inference when using propeller"), + cl::init(false), cl::Optional); + cl::opt EmitBBHash( "emit-bb-hash", cl::desc( @@ -1287,12 +1293,15 @@ void TargetPassConfig::addMachinePasses() { // address map (or both). 
if (TM->getBBSectionsType() != llvm::BasicBlockSection::None || TM->Options.BBAddrMap) { - if (EmitBBHash) + if (EmitBBHash || PropellerMatchInfer) addPass(llvm::createMachineBlockHashInfoPass()); if (TM->getBBSectionsType() == llvm::BasicBlockSection::List) { addPass(llvm::createBasicBlockSectionsProfileReaderWrapperPass( TM->getBBSectionsFuncListBuf())); - addPass(llvm::createBasicBlockPathCloningPass()); + if (PropellerMatchInfer) + addPass(llvm::createBasicBlockMatchingAndInferencePass()); + else + addPass(llvm::createBasicBlockPathCloningPass()); } addPass(llvm::createBasicBlockSectionsPass()); } diff --git a/llvm/lib/Transforms/Utils/SampleProfileInference.cpp b/llvm/lib/Transforms/Utils/SampleProfileInference.cpp index 53bcaa6d3df03..934d1589c4a2e 100644 --- a/llvm/lib/Transforms/Utils/SampleProfileInference.cpp +++ b/llvm/lib/Transforms/Utils/SampleProfileInference.cpp @@ -1174,8 +1174,6 @@ std::pair assignJumpCosts(const ProfiParams &Params, else CostInc = Params.CostJumpUnknownInc; CostDec = 0; - } else { - assert(Jump.Weight > 0 && "found zero-weight jump with a positive weight"); } return std::make_pair(CostInc, CostDec); } diff --git a/llvm/test/CodeGen/X86/basic-block-sections-clusters-bb-hash.ll b/llvm/test/CodeGen/X86/basic-block-sections-clusters-bb-hash.ll new file mode 100644 index 0000000000000..0ce3a522b932d --- /dev/null +++ b/llvm/test/CodeGen/X86/basic-block-sections-clusters-bb-hash.ll @@ -0,0 +1,99 @@ +; BB cluster section tests when using edges profile and basic block hashes to generate clusters. +; +; Test1: Basic blocks #0 (entry), #1 and #3 will be placed in the same section. +; The rest will be placed in the cold section. +; +; RUN: llc %s -O0 -mtriple=x86_64-pc-linux -function-sections -filetype=obj -basic-block-address-map -emit-bb-hash -o %t.o +; +; RUN: echo 'v1' > %t1 +; RUN: echo 'f foo' >> %t1 +; RUN: echo 'g 0:100,1:100,2:0 1:100,3:100 2:0,3:0 3:100' >> %t1 +; +; These commands read BB hashes from SHT_LLVM_BB_ADDR_MAP +; and put them into the basic blocks sections profile. +; RUN: llvm-readobj %t.o --bb-addr-map | \ +; RUN: awk 'BEGIN {printf "h"} \ +; RUN: /ID: [0-9]+/ {id=$2} \ +; RUN: /Hash: 0x[0-9A-Fa-f]+/ {gsub(/^0x/, "", $2); hash=$2; printf " %%s:%%s", id, hash} \ +; RUN: END {print ""}' \ +; RUN: >> %t1 +; +; RUN: llc < %s -O0 -mtriple=x86_64-pc-linux -function-sections -basic-block-sections=%t1 -propeller-match-infer | \ +; RUN: FileCheck %s -check-prefix=LINUX-SECTIONS1 +; +; Test2: Basic #0 (entry), #2 and #3 will be placed in the same section. +; The rest will be placed in the cold section. +; +; RUN: echo 'v1' > %t2 +; RUN: echo 'f foo' >> %t2 +; RUN: echo 'g 0:100,1:0,2:100 1:0,3:0 2:100,3:100 3:100' >> %t2 +; +; These commands read BB hashes from SHT_LLVM_BB_ADDR_MAP +; and put them into the basic blocks sections profile. 
+; RUN: llvm-readobj %t.o --bb-addr-map | \ +; RUN: awk 'BEGIN {printf "h"} \ +; RUN: /ID: [0-9]+/ {id=$2} \ +; RUN: /Hash: 0x[0-9A-Fa-f]+/ {gsub(/^0x/, "", $2); hash=$2; printf " %%s:%%s", id, hash} \ +; RUN: END {print ""}' \ +; RUN: >> %t2 +; +; RUN: llc < %s -O0 -mtriple=x86_64-pc-linux -function-sections -basic-block-sections=%t2 -propeller-match-infer | \ +; RUN: FileCheck %s -check-prefix=LINUX-SECTIONS2 + +define void @foo(i1 zeroext) nounwind { + %2 = alloca i8, align 1 + %3 = zext i1 %0 to i8 + store i8 %3, ptr %2, align 1 + %4 = load i8, ptr %2, align 1 + %5 = trunc i8 %4 to i1 + br i1 %5, label %6, label %8 + +6: ; preds = %1 + %7 = call i32 @bar() + br label %10 + +8: ; preds = %1 + %9 = call i32 @baz() + br label %10 + +10: ; preds = %8, %6 + ret void +} + +declare i32 @bar() #1 + +declare i32 @baz() #1 + +; LINUX-SECTIONS1: .section .text.foo,"ax",@progbits +; LINUX-SECTIONS1-NOT: .section +; LINUX-SECTIONS1-LABEL: foo: +; LINUX-SECTIONS1-NOT: .section +; LINUX-SECTIONS1-NOT: .LBB_END0_{{0-9}}+ +; LINUX-SECTIONS1-LABEL: # %bb.1: +; LINUX-SECTIONS1-NOT: .section +; LINUX-SECTIONS1-NOT: .LBB_END0_{{0-9}}+ +; LINUX-SECTIONS1-LABEL: .LBB0_3: +; LINUX-SECTIONS1-LABEL: .LBB_END0_3: +; LINUX-SECTIONS1-NEXT: .section .text.split.foo,"ax",@progbits +; LINUX-SECTIONS1-LABEL: foo.cold: +; LINUX-SECTIONS1-LABEL: .LBB_END0_2: +; LINUX-SECTIONS1-NEXT: .size foo.cold, .LBB_END0_2-foo.cold +; LINUX-SECTIONS1-LABEL: .Lfunc_end0: +; LINUX-SECTIONS1-NEXT: .size foo, .Lfunc_end0-foo + +; LINUX-SECTIONS2: .section .text.foo,"ax",@progbits +; LINUX-SECTIONS2-NOT: .section +; LINUX-SECTIONS2-LABEL: foo: +; LINUX-SECTIONS2-NOT: .section +; LINUX-SECTIONS2-NOT: .LBB_END0_{{0-9}}+ +; LINUX-SECTIONS2-LABEL: # %bb.2: +; LINUX-SECTIONS2-NOT: .section +; LINUX-SECTIONS2-NOT: .LBB_END0_{{0-9}}+ +; LINUX-SECTIONS2-LABEL: .LBB0_3: +; LINUX-SECTIONS2-LABEL: .LBB_END0_3: +; LINUX-SECTIONS2-NEXT: .section .text.split.foo,"ax",@progbits +; LINUX-SECTIONS2-LABEL: foo.cold: +; LINUX-SECTIONS2-LABEL: .LBB_END0_1: +; LINUX-SECTIONS2-NEXT: .size foo.cold, .LBB_END0_1-foo.cold +; LINUX-SECTIONS2-LABEL: .Lfunc_end0: +; LINUX-SECTIONS2-NEXT: .size foo, .Lfunc_end0-foo From 7a58b417bc9ba19d05d4c5c2de26b0359827277b Mon Sep 17 00:00:00 2001 From: Nabeel Omer Date: Tue, 11 Nov 2025 17:25:49 +0000 Subject: [PATCH 03/64] Add FramePointerKind::NonLeafNoReserve (#163775) This patch adds a new `FramePointerKind::NonLeafNoReserve` and makes it the default for `-momit-leaf-frame-pointer`. It also adds a new commandline option `-m[no-]reserve-frame-pointer-reg`. This should fix #154379, the main impact of this patch can be found in `clang/lib/Driver/ToolChains/CommonArgs.cpp`. 
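
As a rough illustration of the resulting driver behavior (paraphrasing the
new cases in clang/test/Driver/frame-pointer-elim.c below; foo.c is a
placeholder input and the target is only an example):

  clang -### --target=i386 -S -momit-leaf-frame-pointer foo.c
      => cc1 gets "-mframe-pointer=non-leaf-no-reserve" (the new default)
  clang -### --target=i386 -S -momit-leaf-frame-pointer -mreserve-frame-pointer-reg foo.c
      => cc1 gets "-mframe-pointer=non-leaf" (frame pointer register stays reserved)
  clang -### --target=i386 -S -fomit-frame-pointer -mreserve-frame-pointer-reg foo.c
      => cc1 gets "-mframe-pointer=reserved"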
--- clang/include/clang/Basic/CodeGenOptions.def | 2 +- clang/include/clang/Basic/CodeGenOptions.h | 13 +- clang/include/clang/Options/Options.td | 7 +- clang/lib/CodeGen/CGCall.cpp | 1 + clang/lib/CodeGen/CodeGenModule.cpp | 3 + clang/lib/Driver/ToolChains/Clang.cpp | 3 + clang/lib/Driver/ToolChains/CommonArgs.cpp | 47 +- clang/lib/Driver/ToolChains/Flang.cpp | 3 + clang/test/Driver/frame-pointer-elim.c | 77 +- clang/test/Driver/fuchsia.c | 2 +- .../include/flang/Frontend/CodeGenOptions.def | 2 +- flang/lib/Frontend/CompilerInvocation.cpp | 2 + .../test/Driver/frame-pointer-forwarding.f90 | 4 +- llvm/include/llvm/Support/CodeGen.h | 8 +- llvm/lib/CodeGen/CommandFlags.cpp | 5 + llvm/lib/CodeGen/TargetOptionsImpl.cpp | 3 +- llvm/lib/IR/Function.cpp | 3 + llvm/lib/IR/Verifier.cpp | 3 +- llvm/test/CodeGen/X86/regalloc-fp.ll | 775 ++++++++++++++++++ mlir/include/mlir/Dialect/LLVMIR/LLVMEnums.td | 5 +- 20 files changed, 908 insertions(+), 60 deletions(-) create mode 100644 llvm/test/CodeGen/X86/regalloc-fp.ll diff --git a/clang/include/clang/Basic/CodeGenOptions.def b/clang/include/clang/Basic/CodeGenOptions.def index 90e1f8d1eb5e9..52360b67b306c 100644 --- a/clang/include/clang/Basic/CodeGenOptions.def +++ b/clang/include/clang/Basic/CodeGenOptions.def @@ -54,7 +54,7 @@ CODEGENOPT(SeparateNamedSections, 1, 0, Benign) ///< Set for -fseparate-named-se CODEGENOPT(EnableAIXExtendedAltivecABI, 1, 0, Benign) ///< Set for -mabi=vec-extabi. Enables the extended Altivec ABI on AIX. CODEGENOPT(XCOFFReadOnlyPointers, 1, 0, Benign) ///< Set for -mxcoff-roptr. CODEGENOPT(AllTocData, 1, 0, Benign) ///< AIX -mtocdata -ENUM_CODEGENOPT(FramePointer, FramePointerKind, 2, FramePointerKind::None, Benign) /// frame-pointer: all,non-leaf,reserved,none +ENUM_CODEGENOPT(FramePointer, FramePointerKind, 3, FramePointerKind::None, Benign) /// frame-pointer: all,non-leaf,non-leaf-no-reserve,reserved,none ENUM_CODEGENOPT(ExceptionHandling, ExceptionHandlingKind, 3, ExceptionHandlingKind::None, NotCompatible) diff --git a/clang/include/clang/Basic/CodeGenOptions.h b/clang/include/clang/Basic/CodeGenOptions.h index 5d5cf250b56b9..6c445253d518b 100644 --- a/clang/include/clang/Basic/CodeGenOptions.h +++ b/clang/include/clang/Basic/CodeGenOptions.h @@ -155,10 +155,13 @@ class CodeGenOptions : public CodeGenOptionsBase { std::string BinutilsVersion; enum class FramePointerKind { - None, // Omit all frame pointers. - Reserved, // Maintain valid frame pointer chain. - NonLeaf, // Keep non-leaf frame pointers. - All, // Keep all frame pointers. + NonLeafNoReserve, // Keep non-leaf frame pointers, allow the FP to be used + // as a GPR in leaf functions. + None, // Omit all frame pointers. + Reserved, // Maintain valid frame pointer chain. + NonLeaf, // Keep non-leaf frame pointers, don't allow the FP to be used as a + // GPR in leaf functions. + All, // Keep all frame pointers. 
}; static StringRef getFramePointerKindName(FramePointerKind Kind) { @@ -167,6 +170,8 @@ class CodeGenOptions : public CodeGenOptionsBase { return "none"; case FramePointerKind::Reserved: return "reserved"; + case FramePointerKind::NonLeafNoReserve: + return "non-leaf-no-reserve"; case FramePointerKind::NonLeaf: return "non-leaf"; case FramePointerKind::All: diff --git a/clang/include/clang/Options/Options.td b/clang/include/clang/Options/Options.td index 0a414c685eae6..2f7434d8afe11 100644 --- a/clang/include/clang/Options/Options.td +++ b/clang/include/clang/Options/Options.td @@ -5666,6 +5666,9 @@ def mno_warn_nonportable_cfstrings : Flag<["-"], "mno-warn-nonportable-cfstrings def mno_omit_leaf_frame_pointer : Flag<["-"], "mno-omit-leaf-frame-pointer">, Group; def momit_leaf_frame_pointer : Flag<["-"], "momit-leaf-frame-pointer">, Group, HelpText<"Omit frame pointer setup for leaf functions">; +def mno_reserve_frame_pointer_reg : Flag<["-"], "mno-reserve-frame-pointer-reg">, Group; +def mreserve_frame_pointer_reg : Flag<["-"], "mreserve-frame-pointer-reg">, Group, + HelpText<"Reserve the frame pointer register even if the function doesn't have a frame">; def moslib_EQ : Joined<["-"], "moslib=">, Group; def mpascal_strings : Flag<["-"], "mpascal-strings">, Alias; def mred_zone : Flag<["-"], "mred-zone">, Group; @@ -8494,8 +8497,8 @@ def pic_is_pie : Flag<["-"], "pic-is-pie">, MarshallingInfoFlag>; def mframe_pointer_EQ : Joined<["-"], "mframe-pointer=">, - HelpText<"Specify which frame pointers to retain.">, Values<"all,non-leaf,reserved,none">, - NormalizedValuesScope<"CodeGenOptions::FramePointerKind">, NormalizedValues<["All", "NonLeaf", "Reserved", "None"]>, + HelpText<"Specify which frame pointers to retain.">, Values<"all,non-leaf,non-leaf-no-reserve,reserved,none">, + NormalizedValuesScope<"CodeGenOptions::FramePointerKind">, NormalizedValues<["All", "NonLeaf", "NonLeafNoReserve", "Reserved", "None"]>, MarshallingInfoEnum, "None">; diff --git a/clang/lib/CodeGen/CGCall.cpp b/clang/lib/CodeGen/CGCall.cpp index d4d5ea80a84ec..efacb3cc04c01 100644 --- a/clang/lib/CodeGen/CGCall.cpp +++ b/clang/lib/CodeGen/CGCall.cpp @@ -1991,6 +1991,7 @@ static void getTrivialDefaultFunctionAttributes( // This is the default behavior. 
break; case CodeGenOptions::FramePointerKind::Reserved: + case CodeGenOptions::FramePointerKind::NonLeafNoReserve: case CodeGenOptions::FramePointerKind::NonLeaf: case CodeGenOptions::FramePointerKind::All: FuncAttrs.addAttribute("frame-pointer", diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp index 98d59b79ab881..f303550c64292 100644 --- a/clang/lib/CodeGen/CodeGenModule.cpp +++ b/clang/lib/CodeGen/CodeGenModule.cpp @@ -1512,6 +1512,9 @@ void CodeGenModule::Release() { case CodeGenOptions::FramePointerKind::Reserved: getModule().setFramePointer(llvm::FramePointerKind::Reserved); break; + case CodeGenOptions::FramePointerKind::NonLeafNoReserve: + getModule().setFramePointer(llvm::FramePointerKind::NonLeafNoReserve); + break; case CodeGenOptions::FramePointerKind::NonLeaf: getModule().setFramePointer(llvm::FramePointerKind::NonLeaf); break; diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index 2791b1e57877e..80389937ee218 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -5704,6 +5704,9 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA, case CodeGenOptions::FramePointerKind::Reserved: FPKeepKindStr = "-mframe-pointer=reserved"; break; + case CodeGenOptions::FramePointerKind::NonLeafNoReserve: + FPKeepKindStr = "-mframe-pointer=non-leaf-no-reserve"; + break; case CodeGenOptions::FramePointerKind::NonLeaf: FPKeepKindStr = "-mframe-pointer=non-leaf"; break; diff --git a/clang/lib/Driver/ToolChains/CommonArgs.cpp b/clang/lib/Driver/ToolChains/CommonArgs.cpp index 9e3ca9f281195..4c036f0f8dee3 100644 --- a/clang/lib/Driver/ToolChains/CommonArgs.cpp +++ b/clang/lib/Driver/ToolChains/CommonArgs.cpp @@ -221,26 +221,39 @@ static bool framePointerImpliesLeafFramePointer(const llvm::opt::ArgList &Args, clang::CodeGenOptions::FramePointerKind getFramePointerKind(const llvm::opt::ArgList &Args, const llvm::Triple &Triple) { - // There are three things to consider here: + // There are four things to consider here: // * Should a frame record be created for non-leaf functions? // * Should a frame record be created for leaf functions? - // * Is the frame pointer register reserved, i.e. must it always point to - // either a new, valid frame record or be un-modified? + // * Is the frame pointer register reserved in non-leaf functions? + // i.e. must it always point to either a new, valid frame record or be + // un-modified? + // * Is the frame pointer register reserved in leaf functions? // // Not all combinations of these are valid: // * It's not useful to have leaf frame records without non-leaf ones. // * It's not useful to have frame records without reserving the frame // pointer. 
// - // | Non-leaf | Leaf | Reserved | - // | N | N | N | FramePointerKind::None - // | N | N | Y | FramePointerKind::Reserved - // | N | Y | N | Invalid - // | N | Y | Y | Invalid - // | Y | N | N | Invalid - // | Y | N | Y | FramePointerKind::NonLeaf - // | Y | Y | N | Invalid - // | Y | Y | Y | FramePointerKind::All + // | Frame Setup | Reg Reserved | + // |-----------------|-----------------| + // | Non-leaf | Leaf | Non-Leaf | Leaf | + // |----------|------|----------|------| + // | N | N | N | N | FramePointerKind::None + // | N | N | N | Y | Invalid + // | N | N | Y | N | Invalid + // | N | N | Y | Y | FramePointerKind::Reserved + // | N | Y | N | N | Invalid + // | N | Y | N | Y | Invalid + // | N | Y | Y | N | Invalid + // | N | Y | Y | Y | Invalid + // | Y | N | N | N | Invalid + // | Y | N | N | Y | Invalid + // | Y | N | Y | N | FramePointerKind::NonLeafNoReserve + // | Y | N | Y | Y | FramePointerKind::NonLeaf + // | Y | Y | N | N | Invalid + // | Y | Y | N | Y | Invalid + // | Y | Y | Y | N | Invalid + // | Y | Y | Y | Y | FramePointerKind::All // // The FramePointerKind::Reserved case is currently only reachable for Arm, // which has the -mframe-chain= option which can (in combination with @@ -259,12 +272,18 @@ getFramePointerKind(const llvm::opt::ArgList &Args, Args.hasFlag(options::OPT_mno_omit_leaf_frame_pointer, options::OPT_momit_leaf_frame_pointer, DefaultLeafFP); - bool FPRegReserved = EnableFP || mustMaintainValidFrameChain(Args, Triple); + bool FPRegReserved = Args.hasFlag(options::OPT_mreserve_frame_pointer_reg, + options::OPT_mno_reserve_frame_pointer_reg, + mustMaintainValidFrameChain(Args, Triple)); if (EnableFP) { if (EnableLeafFP) return clang::CodeGenOptions::FramePointerKind::All; - return clang::CodeGenOptions::FramePointerKind::NonLeaf; + + if (FPRegReserved) + return clang::CodeGenOptions::FramePointerKind::NonLeaf; + + return clang::CodeGenOptions::FramePointerKind::NonLeafNoReserve; } if (FPRegReserved) return clang::CodeGenOptions::FramePointerKind::Reserved; diff --git a/clang/lib/Driver/ToolChains/Flang.cpp b/clang/lib/Driver/ToolChains/Flang.cpp index 038395e4b68f2..270904de544d6 100644 --- a/clang/lib/Driver/ToolChains/Flang.cpp +++ b/clang/lib/Driver/ToolChains/Flang.cpp @@ -1071,6 +1071,9 @@ void Flang::ConstructJob(Compilation &C, const JobAction &JA, case CodeGenOptions::FramePointerKind::Reserved: FPKeepKindStr = "-mframe-pointer=reserved"; break; + case CodeGenOptions::FramePointerKind::NonLeafNoReserve: + FPKeepKindStr = "-mframe-pointer=non-leaf-no-reserve"; + break; case CodeGenOptions::FramePointerKind::NonLeaf: FPKeepKindStr = "-mframe-pointer=non-leaf"; break; diff --git a/clang/test/Driver/frame-pointer-elim.c b/clang/test/Driver/frame-pointer-elim.c index 6d719828c6a06..e68fbf529643e 100644 --- a/clang/test/Driver/frame-pointer-elim.c +++ b/clang/test/Driver/frame-pointer-elim.c @@ -2,6 +2,8 @@ // KEEP-ALL: "-mframe-pointer=all" // KEEP-NON-LEAF-NOT: warning: argument unused // KEEP-NON-LEAF: "-mframe-pointer=non-leaf" +// KEEP-NON-LEAF-NO-RESERVE-NOT: warning: argument unused +// KEEP-NON-LEAF-NO-RESERVE: "-mframe-pointer=non-leaf-no-reserve" // KEEP-NONE-NOT: warning: argument unused // KEEP-NONE: "-mframe-pointer=none" // KEEP-RESERVED-NOT: warning: argument unused @@ -24,19 +26,27 @@ // -momit-leaf-frame-pointer omits leaf frame pointer. // -fno-omit-frame-pointer loses out to -momit-leaf-frame-pointer. 
// RUN: %clang -### --target=i386 -S -momit-leaf-frame-pointer %s 2>&1 | \ -// RUN: FileCheck --check-prefix=KEEP-NON-LEAF %s +// RUN: FileCheck --check-prefix=KEEP-NON-LEAF-NO-RESERVE %s // RUN: %clang -### --target=i386-linux -S -O1 -fno-omit-frame-pointer -momit-leaf-frame-pointer %s 2>&1 | \ -// RUN: FileCheck --check-prefix=KEEP-NON-LEAF %s +// RUN: FileCheck --check-prefix=KEEP-NON-LEAF-NO-RESERVE %s // RUN: %clang -### --target=i386-linux -S -O1 -momit-leaf-frame-pointer %s 2>&1 | \ // RUN: FileCheck --check-prefix=KEEP-NONE %s +// -momit-leaf-frame-pointer -mreserve-frame-pointer-reg results in the frame pointer reg being reserved +// RUN: %clang -### --target=i386 -S -momit-leaf-frame-pointer -mreserve-frame-pointer-reg %s 2>&1 | \ +// RUN: FileCheck --check-prefix=KEEP-NON-LEAF %s + +// -fomit-frame-pointer -mreserve-frame-pointer-reg results in the frame pointer reg being reserved +// RUN: %clang -### --target=i386 -S -fomit-frame-pointer -mreserve-frame-pointer-reg %s 2>&1 | \ +// RUN: FileCheck --check-prefix=KEEP-RESERVED %s + // fno-omit-frame-pointer -momit-leaf-frame-pointer can be overwritten by // fomit-frame-pointer later on the command without warning // RUN: %clang -### --target=i386-linux -S -O1 -fno-omit-frame-pointer -momit-leaf-frame-pointer -fomit-frame-pointer %s 2>&1 | \ // RUN: FileCheck --check-prefix=KEEP-NONE %s // RUN: %clang -### --target=i386-linux -S -O1 -fno-omit-frame-pointer -momit-leaf-frame-pointer %s 2>&1 | \ -// RUN: FileCheck --check-prefix=KEEP-NON-LEAF %s +// RUN: FileCheck --check-prefix=KEEP-NON-LEAF-NO-RESERVE %s // Explicit or default -fomit-frame-pointer wins over -mno-omit-leaf-frame-pointer. // RUN: %clang -### --target=i386 -S %s -fomit-frame-pointer -mno-omit-leaf-frame-pointer 2>&1 | \ // RUN: FileCheck --check-prefix=KEEP-NONE %s @@ -68,45 +78,45 @@ // RUN: FileCheck --check-prefix=KEEP-NONE %s // RUN: %clang -### --target=i386-darwin -S -momit-leaf-frame-pointer %s 2>&1 | \ -// RUN: FileCheck --check-prefix=KEEP-NON-LEAF %s +// RUN: FileCheck --check-prefix=KEEP-NON-LEAF-NO-RESERVE %s // RUN: %clang -### -target armv7s-apple-ios -fomit-frame-pointer %s 2>&1 | \ // RUN: FileCheck --check-prefix=WARN-OMIT-7S %s // WARN-OMIT-7S: warning: optimization flag '-fomit-frame-pointer' is not supported for target 'armv7s' -// WARN-OMIT-7S: "-mframe-pointer=non-leaf" +// WARN-OMIT-7S: "-mframe-pointer=non-leaf-no-reserve" // RUN: %clang -### -target armv7k-apple-watchos -fomit-frame-pointer %s 2>&1 | \ // RUN: FileCheck --check-prefix=WARN-OMIT-7K %s // WARN-OMIT-7K: warning: optimization flag '-fomit-frame-pointer' is not supported for target 'armv7k' -// WARN-OMIT-7K: "-mframe-pointer=non-leaf" +// WARN-OMIT-7K: "-mframe-pointer=non-leaf-no-reserve" // RUN: %clang -### -target armv7s-apple-ios8.0 -momit-leaf-frame-pointer %s 2>&1 | \ // RUN: FileCheck --check-prefix=WARN-OMIT-LEAF-7S %s // WARN-OMIT-LEAF-7S-NOT: warning: optimization flag '-momit-leaf-frame-pointer' is not supported for target 'armv7s' -// WARN-OMIT-LEAF-7S: "-mframe-pointer=non-leaf" +// WARN-OMIT-LEAF-7S: "-mframe-pointer=non-leaf-no-reserve" // On AArch64, PS4, PS5, and VE, default to omitting the frame pointer on leaf // functions // RUN: %clang -### --target=aarch64 -S %s 2>&1 | \ -// RUN: FileCheck --check-prefix=KEEP-NON-LEAF %s +// RUN: FileCheck --check-prefix=KEEP-NON-LEAF-NO-RESERVE %s // RUN: %clang -### --target=x86_64-scei-ps4 -S %s 2>&1 | \ -// RUN: FileCheck --check-prefix=KEEP-NON-LEAF %s +// RUN: FileCheck --check-prefix=KEEP-NON-LEAF-NO-RESERVE %s // RUN: 
%clang -### --target=x86_64-scei-ps4 -S -O2 %s 2>&1 | \ -// RUN: FileCheck --check-prefix=KEEP-NON-LEAF %s +// RUN: FileCheck --check-prefix=KEEP-NON-LEAF-NO-RESERVE %s // RUN: %clang -### --target=x86_64-sie-ps5 -S %s 2>&1 | \ -// RUN: FileCheck --check-prefix=KEEP-NON-LEAF %s +// RUN: FileCheck --check-prefix=KEEP-NON-LEAF-NO-RESERVE %s // RUN: %clang -### --target=x86_64-sie-ps5 -S -O2 %s 2>&1 | \ -// RUN: FileCheck --check-prefix=KEEP-NON-LEAF %s +// RUN: FileCheck --check-prefix=KEEP-NON-LEAF-NO-RESERVE %s // RUN: %clang -### -target aarch64-apple-darwin -arch arm64_32 -S %s 2>&1 | \ -// RUN: FileCheck --check-prefix=KEEP-NON-LEAF %s +// RUN: FileCheck --check-prefix=KEEP-NON-LEAF-NO-RESERVE %s // RUN: %clang -### --target=ve-unknown-linux-gnu -S %s 2>&1 | \ -// RUN: FileCheck --check-prefix=KEEP-NON-LEAF %s +// RUN: FileCheck --check-prefix=KEEP-NON-LEAF-NO-RESERVE %s // RUN: %clang -### --target=aarch64-linux-android -S %s 2>&1 | \ -// RUN: FileCheck --check-prefix=KEEP-NON-LEAF %s +// RUN: FileCheck --check-prefix=KEEP-NON-LEAF-NO-RESERVE %s // RUN: %clang -### --target=aarch64-linux-android -S -O2 %s 2>&1 | \ -// RUN: FileCheck --check-prefix=KEEP-NON-LEAF %s +// RUN: FileCheck --check-prefix=KEEP-NON-LEAF-NO-RESERVE %s // RUN: %clang -### --target=aarch64-linux-android -S -Os %s 2>&1 | \ -// RUN: FileCheck --check-prefix=KEEP-NON-LEAF %s +// RUN: FileCheck --check-prefix=KEEP-NON-LEAF-NO-RESERVE %s // RUN: %clang -### --target=powerpc64 -S %s 2>&1 | \ // RUN: FileCheck --check-prefix=KEEP-ALL %s @@ -161,9 +171,9 @@ // RUN: %clang -### --target=armv7a-linux-androideabi- -mthumb -mbig-endian -O1 -S %s 2>&1 | \ // RUN: FileCheck --check-prefix=KEEP-ALL %s // RUN: %clang -### --target=riscv64-linux-android -O1 -S %s 2>&1 | \ -// RUN: FileCheck --check-prefix=KEEP-NON-LEAF %s +// RUN: FileCheck --check-prefix=KEEP-NON-LEAF-NO-RESERVE %s // RUN: %clang -### --target=riscv64-linux-android -mbig-endian -O1 -S %s 2>&1 | \ -// RUN: FileCheck --check-prefix=KEEP-NON-LEAF %s +// RUN: FileCheck --check-prefix=KEEP-NON-LEAF-NO-RESERVE %s // On ARM backend bare metal targets, frame pointer is omitted // RUN: %clang -### --target=arm-arm-none-eabi -S %s 2>&1 | \ @@ -191,21 +201,21 @@ // Check that for Apple bare metal targets, we're keeping frame pointers by default // RUN: %clang -### --target=armv6m-apple-none-macho -S %s 2>&1 | \ -// RUN: FileCheck --check-prefix=KEEP-NON-LEAF %s +// RUN: FileCheck --check-prefix=KEEP-NON-LEAF-NO-RESERVE %s // RUN: %clang -### --target=armv6m-apple-none-macho -S -fno-omit-frame-pointer %s 2>&1 | \ -// RUN: FileCheck --check-prefix=KEEP-NON-LEAF %s +// RUN: FileCheck --check-prefix=KEEP-NON-LEAF-NO-RESERVE %s // RUN: %clang -### --target=arm-apple-none-macho -S %s 2>&1 | \ -// RUN: FileCheck --check-prefix=KEEP-NON-LEAF %s +// RUN: FileCheck --check-prefix=KEEP-NON-LEAF-NO-RESERVE %s // RUN: %clang -### --target=arm-apple-none-macho -S -fno-omit-frame-pointer %s 2>&1 | \ -// RUN: FileCheck --check-prefix=KEEP-NON-LEAF %s +// RUN: FileCheck --check-prefix=KEEP-NON-LEAF-NO-RESERVE %s // RUN: %clang -### --target=armv6m-apple-none-macho -S -O1 %s 2>&1 | \ -// RUN: FileCheck --check-prefix=KEEP-NON-LEAF %s +// RUN: FileCheck --check-prefix=KEEP-NON-LEAF-NO-RESERVE %s // RUN: %clang -### --target=armv6m-apple-none-macho -S -O1 -fno-omit-frame-pointer %s 2>&1 | \ -// RUN: FileCheck --check-prefix=KEEP-NON-LEAF %s +// RUN: FileCheck --check-prefix=KEEP-NON-LEAF-NO-RESERVE %s // RUN: %clang -### --target=arm-apple-none-macho -S -O1 %s 2>&1 | \ -// RUN: FileCheck 
--check-prefix=KEEP-NON-LEAF %s
+// RUN: FileCheck --check-prefix=KEEP-NON-LEAF-NO-RESERVE %s
 // RUN: %clang -### --target=arm-apple-none-macho -S -O1 -fno-omit-frame-pointer %s 2>&1 | \
-// RUN: FileCheck --check-prefix=KEEP-NON-LEAF %s
+// RUN: FileCheck --check-prefix=KEEP-NON-LEAF-NO-RESERVE %s
 
 // RUN: %clang --target=armv7-apple-macho -### -S %s 2>&1 \
 // RUN: -fomit-frame-pointer \
@@ -221,17 +231,21 @@
 
 // AArch64 bare metal targets behave like hosted targets
 // RUN: %clang -### --target=aarch64-none-elf -S %s 2>&1 | \
-// RUN: FileCheck --check-prefix=KEEP-NON-LEAF %s
+// RUN: FileCheck --check-prefix=KEEP-NON-LEAF-NO-RESERVE %s
 // RUN: %clang -### --target=aarch64-none-elf -S -O1 %s 2>&1 | \
-// RUN: FileCheck --check-prefix=KEEP-NON-LEAF %s
+// RUN: FileCheck --check-prefix=KEEP-NON-LEAF-NO-RESERVE %s
 // RUN: %clang -### --target=aarch64-none-elf -S -fno-omit-frame-pointer %s 2>&1 | \
-// RUN: FileCheck --check-prefix=KEEP-NON-LEAF %s
+// RUN: FileCheck --check-prefix=KEEP-NON-LEAF-NO-RESERVE %s
 // RUN: %clang -### --target=aarch64-none-elf -S -O1 -fno-omit-frame-pointer %s 2>&1 | \
-// RUN: FileCheck --check-prefix=KEEP-NON-LEAF %s
+// RUN: FileCheck --check-prefix=KEEP-NON-LEAF-NO-RESERVE %s
 
 // AArch64 Windows requires that the frame pointer be reserved
 // RUN: %clang -### --target=aarch64-pc-windows-msvc -S -fomit-frame-pointer %s 2>&1 | \
 // RUN: FileCheck --check-prefix=KEEP-RESERVED %s
 
+// -mno-reserve-frame-pointer-reg overrides the target platform default
+// RUN: %clang -### --target=aarch64-pc-windows-msvc -S -fomit-frame-pointer -mno-reserve-frame-pointer-reg %s 2>&1 | \
+// RUN: FileCheck --check-prefix=KEEP-NONE %s
+
 void f0() {}
 void f1() { f0(); }
 
diff --git a/clang/test/Driver/fuchsia.c b/clang/test/Driver/fuchsia.c
index 99e5018117924..0cf7535d14bd5 100644
--- a/clang/test/Driver/fuchsia.c
+++ b/clang/test/Driver/fuchsia.c
@@ -77,7 +77,7 @@
 // RUN: %clang -### %s --target=aarch64-unknown-fuchsia -O3 2>&1 \
 // RUN: | FileCheck %s -check-prefix=CHECK-FP-NONE
 // CHECK-FP-ALL: "-mframe-pointer=all"
-// CHECK-FP-NONLEAF: "-mframe-pointer=non-leaf"
+// CHECK-FP-NONLEAF: "-mframe-pointer=non-leaf-no-reserve"
 // CHECK-FP-NONE: "-mframe-pointer=none"
 
 // RUN: not %clang -### %s --target=x86_64-unknown-fuchsia -rtlib=libgcc 2>&1 \
diff --git a/flang/include/flang/Frontend/CodeGenOptions.def b/flang/include/flang/Frontend/CodeGenOptions.def
index dc3da7ba5c7f3..d5415faf06f47 100644
--- a/flang/include/flang/Frontend/CodeGenOptions.def
+++ b/flang/include/flang/Frontend/CodeGenOptions.def
@@ -54,7 +54,7 @@ CODEGENOPT(Underscoring, 1, 1)
 ENUM_CODEGENOPT(RelocationModel, llvm::Reloc::Model, 3, llvm::Reloc::PIC_) ///< Name of the relocation model to use.
 ENUM_CODEGENOPT(DebugInfo, llvm::codegenoptions::DebugInfoKind, 4, llvm::codegenoptions::NoDebugInfo) ///< Level of debug info to generate
 ENUM_CODEGENOPT(VecLib, llvm::driver::VectorLibrary, 4, llvm::driver::VectorLibrary::NoLibrary) ///< Vector functions library to use
-ENUM_CODEGENOPT(FramePointer, llvm::FramePointerKind, 2, llvm::FramePointerKind::None) ///< Enable the usage of frame pointers
+ENUM_CODEGENOPT(FramePointer, llvm::FramePointerKind, 3, llvm::FramePointerKind::None) ///< Enable the usage of frame pointers
 
 ENUM_CODEGENOPT(ComplexRange, ComplexRangeKind, 3, ComplexRangeKind::CX_Full) ///< Method for calculating complex number division
 ENUM_CODEGENOPT(DoConcurrentMapping, DoConcurrentMappingKind, 2, DoConcurrentMappingKind::DCMK_None) ///< Map `do concurrent` to OpenMP
 
diff --git a/flang/lib/Frontend/CompilerInvocation.cpp b/flang/lib/Frontend/CompilerInvocation.cpp
index a9bb78d690f46..893121fe01f27 100644
--- a/flang/lib/Frontend/CompilerInvocation.cpp
+++ b/flang/lib/Frontend/CompilerInvocation.cpp
@@ -309,6 +309,8 @@ static void parseCodeGenArgs(Fortran::frontend::CodeGenOptions &opts,
         llvm::StringSwitch<std::optional<llvm::FramePointerKind>>(a->getValue())
             .Case("none", llvm::FramePointerKind::None)
             .Case("non-leaf", llvm::FramePointerKind::NonLeaf)
+            .Case("non-leaf-no-reserve",
+                  llvm::FramePointerKind::NonLeafNoReserve)
             .Case("reserved", llvm::FramePointerKind::Reserved)
             .Case("all", llvm::FramePointerKind::All)
             .Default(std::nullopt);
diff --git a/flang/test/Driver/frame-pointer-forwarding.f90 b/flang/test/Driver/frame-pointer-forwarding.f90
index 9fcbd6e12f98b..7e97c98d899f1 100644
--- a/flang/test/Driver/frame-pointer-forwarding.f90
+++ b/flang/test/Driver/frame-pointer-forwarding.f90
@@ -1,12 +1,12 @@
 ! Test that flang forwards -fno-omit-frame-pointer and -fomit-frame-pointer to the Flang frontend
 ! RUN: %flang --target=aarch64-none-none -fsyntax-only -### %s -o %t 2>&1 | FileCheck %s --check-prefix=CHECK-NOVALUE
-! CHECK-NOVALUE: "-fc1"{{.*}}"-mframe-pointer=non-leaf"
+! CHECK-NOVALUE: "-fc1"{{.*}}"-mframe-pointer=non-leaf-no-reserve"
 
 ! RUN: %flang -fomit-frame-pointer --target=aarch64-none-none -fsyntax-only -### %s -o %t 2>&1 | FileCheck %s --check-prefix=CHECK-NONEFP
 ! CHECK-NONEFP: "-fc1"{{.*}}"-mframe-pointer=none"
 
 ! RUN: %flang -fno-omit-frame-pointer --target=aarch64-none-none -fsyntax-only -### %s -o %t 2>&1 | FileCheck %s --check-prefix=CHECK-NONLEAFFP
-! CHECK-NONLEAFFP: "-fc1"{{.*}}"-mframe-pointer=non-leaf"
+! CHECK-NONLEAFFP: "-fc1"{{.*}}"-mframe-pointer=non-leaf-no-reserve"
 
 ! RUN: %flang -fno-omit-frame-pointer --target=x86-none-none -fsyntax-only -### %s -o %t 2>&1 | FileCheck %s --check-prefix=CHECK-ALLFP
 ! CHECK-ALLFP: "-fc1"{{.*}}"-mframe-pointer=all"
diff --git a/llvm/include/llvm/Support/CodeGen.h b/llvm/include/llvm/Support/CodeGen.h
index cd1f9167b996d..15df265556339 100644
--- a/llvm/include/llvm/Support/CodeGen.h
+++ b/llvm/include/llvm/Support/CodeGen.h
@@ -115,7 +115,13 @@ namespace llvm {
   };
 
   // Specify what functions should keep the frame pointer.
-  enum class FramePointerKind { None, NonLeaf, All, Reserved };
+  enum class FramePointerKind {
+    None,
+    NonLeaf,
+    All,
+    Reserved,
+    NonLeafNoReserve
+  };
 
   // Specify what type of zeroing callee-used registers.
 namespace ZeroCallUsedRegs {
diff --git a/llvm/lib/CodeGen/CommandFlags.cpp b/llvm/lib/CodeGen/CommandFlags.cpp
index c1365f499dcf5..02ae722b5a56e 100644
--- a/llvm/lib/CodeGen/CommandFlags.cpp
+++ b/llvm/lib/CodeGen/CommandFlags.cpp
@@ -210,6 +210,9 @@ codegen::RegisterCodeGenFlags::RegisterCodeGenFlags() {
           clEnumValN(FramePointerKind::All, "all",
                      "Disable frame pointer elimination"),
           clEnumValN(FramePointerKind::NonLeaf, "non-leaf",
+                     "Disable frame pointer elimination for non-leaf functions "
+                     "but reserve the register in leaf functions"),
+          clEnumValN(FramePointerKind::NonLeafNoReserve, "non-leaf-no-reserve",
                      "Disable frame pointer elimination for non-leaf frame"),
           clEnumValN(FramePointerKind::Reserved, "reserved",
                      "Enable frame pointer elimination, but reserve the frame "
@@ -687,6 +690,8 @@ void codegen::setFunctionAttributes(StringRef CPU, StringRef Features,
     NewAttrs.addAttribute("frame-pointer", "all");
   else if (getFramePointerUsage() == FramePointerKind::NonLeaf)
     NewAttrs.addAttribute("frame-pointer", "non-leaf");
+  else if (getFramePointerUsage() == FramePointerKind::NonLeafNoReserve)
+    NewAttrs.addAttribute("frame-pointer", "non-leaf-no-reserve");
   else if (getFramePointerUsage() == FramePointerKind::Reserved)
     NewAttrs.addAttribute("frame-pointer", "reserved");
   else if (getFramePointerUsage() == FramePointerKind::None)
diff --git a/llvm/lib/CodeGen/TargetOptionsImpl.cpp b/llvm/lib/CodeGen/TargetOptionsImpl.cpp
index c33bf8b014b55..16d86b42db4a3 100644
--- a/llvm/lib/CodeGen/TargetOptionsImpl.cpp
+++ b/llvm/lib/CodeGen/TargetOptionsImpl.cpp
@@ -30,7 +30,7 @@ bool TargetOptions::DisableFramePointerElim(const MachineFunction &MF) const {
   StringRef FP = FPAttr.getValueAsString();
   if (FP == "all")
     return true;
-  if (FP == "non-leaf")
+  if (FP == "non-leaf" || FP == "non-leaf-no-reserve")
     return MF.getFrameInfo().hasCalls();
   if (FP == "none" || FP == "reserved")
     return false;
@@ -45,6 +45,7 @@ bool TargetOptions::FramePointerIsReserved(const MachineFunction &MF) const {
 
   return StringSwitch<bool>(FPAttr.getValueAsString())
       .Cases({"all", "non-leaf", "reserved"}, true)
+      .Case("non-leaf-no-reserve", MF.getFrameInfo().hasCalls())
       .Case("none", false);
 }
diff --git a/llvm/lib/IR/Function.cpp b/llvm/lib/IR/Function.cpp
index fc067459dcba3..31a294447152e 100644
--- a/llvm/lib/IR/Function.cpp
+++ b/llvm/lib/IR/Function.cpp
@@ -396,6 +396,9 @@ Function *Function::createWithDefaultAttr(FunctionType *Ty,
   case FramePointerKind::NonLeaf:
     B.addAttribute("frame-pointer", "non-leaf");
     break;
+  case FramePointerKind::NonLeafNoReserve:
+    B.addAttribute("frame-pointer", "non-leaf-no-reserve");
+    break;
   case FramePointerKind::All:
     B.addAttribute("frame-pointer", "all");
     break;
diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp
index a4f647409094c..45f3c1bcbf5f3 100644
--- a/llvm/lib/IR/Verifier.cpp
+++ b/llvm/lib/IR/Verifier.cpp
@@ -2496,7 +2496,8 @@ void Verifier::verifyFunctionAttrs(FunctionType *FT, AttributeList Attrs,
   if (Attribute FPAttr = Attrs.getFnAttr("frame-pointer"); FPAttr.isValid()) {
     StringRef FP = FPAttr.getValueAsString();
-    if (FP != "all" && FP != "non-leaf" && FP != "none" && FP != "reserved")
+    if (FP != "all" && FP != "non-leaf" && FP != "none" && FP != "reserved" &&
+        FP != "non-leaf-no-reserve")
       CheckFailed("invalid value for 'frame-pointer' attribute: " + FP, V);
   }
 
diff --git a/llvm/test/CodeGen/X86/regalloc-fp.ll b/llvm/test/CodeGen/X86/regalloc-fp.ll
new file mode 100644
index 0000000000000..e89e5ab1d6b59
--- /dev/null
+++ b/llvm/test/CodeGen/X86/regalloc-fp.ll
@@ -0,0 
+1,775 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; Context: +; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s +define i32 @check_none() "frame-pointer"="none" { +; CHECK-LABEL: check_none: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: pushq %rbp +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: pushq %r15 +; CHECK-NEXT: .cfi_def_cfa_offset 24 +; CHECK-NEXT: pushq %r14 +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: pushq %r13 +; CHECK-NEXT: .cfi_def_cfa_offset 40 +; CHECK-NEXT: pushq %r12 +; CHECK-NEXT: .cfi_def_cfa_offset 48 +; CHECK-NEXT: pushq %rbx +; CHECK-NEXT: .cfi_def_cfa_offset 56 +; CHECK-NEXT: .cfi_offset %rbx, -56 +; CHECK-NEXT: .cfi_offset %r12, -48 +; CHECK-NEXT: .cfi_offset %r13, -40 +; CHECK-NEXT: .cfi_offset %r14, -32 +; CHECK-NEXT: .cfi_offset %r15, -24 +; CHECK-NEXT: .cfi_offset %rbp, -16 +; CHECK-NEXT: movl $0, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $1, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $2, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $3, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $4, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $5, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $6, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $7, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $8, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $9, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $16, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $17, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $18, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $19, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $20, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %eax +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %ecx +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %edx +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %esi +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %edi +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %r8d +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %r9d +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %r10d +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %r11d +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %ebx +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %ebp +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %r14d +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %r15d +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %r12d +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %r13d +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: movl %eax, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %edx, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %esi, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %edi, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %r8d, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %r9d, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %r10d, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %r11d, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %ebx, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %ebp, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %r14d, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %r15d, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %r12d, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %r13d, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: popq %rbx +; CHECK-NEXT: .cfi_def_cfa_offset 48 +; CHECK-NEXT: popq %r12 +; CHECK-NEXT: .cfi_def_cfa_offset 40 +; CHECK-NEXT: popq %r13 +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: popq %r14 +; CHECK-NEXT: .cfi_def_cfa_offset 24 +; CHECK-NEXT: popq %r15 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: popq %rbp +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: retq +entry: + %reg0 = alloca i32, align 4 + %reg1 = alloca i32, align 4 + %reg2 = alloca i32, align 4 + %reg3 = alloca i32, align 4 + %reg4 = alloca i32, align 4 + %reg5 = alloca i32, align 4 + %reg6 = 
alloca i32, align 4 + %reg7 = alloca i32, align 4 + %reg8 = alloca i32, align 4 + %reg9 = alloca i32, align 4 + %reg10 = alloca i32, align 4 + %reg11 = alloca i32, align 4 + %reg12 = alloca i32, align 4 + %reg13 = alloca i32, align 4 + %reg14 = alloca i32, align 4 + store volatile i32 0, ptr %reg0, align 4 + store volatile i32 1, ptr %reg1, align 4 + store volatile i32 2, ptr %reg2, align 4 + store volatile i32 3, ptr %reg3, align 4 + store volatile i32 4, ptr %reg4, align 4 + store volatile i32 5, ptr %reg5, align 4 + store volatile i32 6, ptr %reg6, align 4 + store volatile i32 7, ptr %reg7, align 4 + store volatile i32 8, ptr %reg8, align 4 + store volatile i32 9, ptr %reg9, align 4 + store volatile i32 16, ptr %reg10, align 4 + store volatile i32 17, ptr %reg11, align 4 + store volatile i32 18, ptr %reg12, align 4 + store volatile i32 19, ptr %reg13, align 4 + store volatile i32 20, ptr %reg14, align 4 + %0 = load volatile i32, ptr %reg0, align 4 + %1 = load volatile i32, ptr %reg1, align 4 + %2 = load volatile i32, ptr %reg2, align 4 + %3 = load volatile i32, ptr %reg3, align 4 + %4 = load volatile i32, ptr %reg4, align 4 + %5 = load volatile i32, ptr %reg5, align 4 + %6 = load volatile i32, ptr %reg6, align 4 + %7 = load volatile i32, ptr %reg7, align 4 + %8 = load volatile i32, ptr %reg8, align 4 + %9 = load volatile i32, ptr %reg9, align 4 + %10 = load volatile i32, ptr %reg10, align 4 + %11 = load volatile i32, ptr %reg11, align 4 + %12 = load volatile i32, ptr %reg12, align 4 + %13 = load volatile i32, ptr %reg13, align 4 + %14 = load volatile i32, ptr %reg14, align 4 + %15 = call { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } asm "nop", "=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,~{dirflag},~{fpsr},~{flags}"(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, i32 %8, i32 %9, i32 %10, i32 %11, i32 %12, i32 %13, i32 %14) #1 + %asmresult = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %15, 0 + %asmresult1 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %15, 1 + %asmresult2 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %15, 2 + %asmresult3 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %15, 3 + %asmresult4 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %15, 4 + %asmresult5 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %15, 5 + %asmresult6 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %15, 6 + %asmresult7 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %15, 7 + %asmresult8 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %15, 8 + %asmresult9 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %15, 9 + %asmresult10 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %15, 10 + %asmresult11 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %15, 11 + %asmresult12 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %15, 12 + %asmresult13 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %15, 13 + 
%asmresult14 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %15, 14 + store volatile i32 %asmresult, ptr %reg0, align 4 + store volatile i32 %asmresult1, ptr %reg1, align 4 + store volatile i32 %asmresult2, ptr %reg2, align 4 + store volatile i32 %asmresult3, ptr %reg3, align 4 + store volatile i32 %asmresult4, ptr %reg4, align 4 + store volatile i32 %asmresult5, ptr %reg5, align 4 + store volatile i32 %asmresult6, ptr %reg6, align 4 + store volatile i32 %asmresult7, ptr %reg7, align 4 + store volatile i32 %asmresult8, ptr %reg8, align 4 + store volatile i32 %asmresult9, ptr %reg9, align 4 + store volatile i32 %asmresult10, ptr %reg10, align 4 + store volatile i32 %asmresult11, ptr %reg11, align 4 + store volatile i32 %asmresult12, ptr %reg12, align 4 + store volatile i32 %asmresult13, ptr %reg13, align 4 + store volatile i32 %asmresult14, ptr %reg14, align 4 + ret i32 0 +} + +define i32 @test_non_leaf_no_reserve() "frame-pointer"="non-leaf-no-reserve" { +; CHECK-LABEL: test_non_leaf_no_reserve: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: pushq %rbp +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: pushq %r15 +; CHECK-NEXT: .cfi_def_cfa_offset 24 +; CHECK-NEXT: pushq %r14 +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: pushq %r13 +; CHECK-NEXT: .cfi_def_cfa_offset 40 +; CHECK-NEXT: pushq %r12 +; CHECK-NEXT: .cfi_def_cfa_offset 48 +; CHECK-NEXT: pushq %rbx +; CHECK-NEXT: .cfi_def_cfa_offset 56 +; CHECK-NEXT: .cfi_offset %rbx, -56 +; CHECK-NEXT: .cfi_offset %r12, -48 +; CHECK-NEXT: .cfi_offset %r13, -40 +; CHECK-NEXT: .cfi_offset %r14, -32 +; CHECK-NEXT: .cfi_offset %r15, -24 +; CHECK-NEXT: .cfi_offset %rbp, -16 +; CHECK-NEXT: movl $0, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $1, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $2, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $3, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $4, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $5, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $6, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $7, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $8, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $9, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $16, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $17, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $18, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $19, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $20, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %eax +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %ecx +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %edx +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %esi +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %edi +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %r8d +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %r9d +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %r10d +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %r11d +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %ebx +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %ebp +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %r14d +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %r15d +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %r12d +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %r13d +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: movl %eax, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %edx, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %esi, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %edi, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %r8d, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %r9d, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %r10d, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %r11d, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %ebx, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %ebp, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: 
movl %r14d, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %r15d, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %r12d, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %r13d, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: popq %rbx +; CHECK-NEXT: .cfi_def_cfa_offset 48 +; CHECK-NEXT: popq %r12 +; CHECK-NEXT: .cfi_def_cfa_offset 40 +; CHECK-NEXT: popq %r13 +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: popq %r14 +; CHECK-NEXT: .cfi_def_cfa_offset 24 +; CHECK-NEXT: popq %r15 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: popq %rbp +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: retq +entry: + %reg0 = alloca i32, align 4 + %reg1 = alloca i32, align 4 + %reg2 = alloca i32, align 4 + %reg3 = alloca i32, align 4 + %reg4 = alloca i32, align 4 + %reg5 = alloca i32, align 4 + %reg6 = alloca i32, align 4 + %reg7 = alloca i32, align 4 + %reg8 = alloca i32, align 4 + %reg9 = alloca i32, align 4 + %reg10 = alloca i32, align 4 + %reg11 = alloca i32, align 4 + %reg12 = alloca i32, align 4 + %reg13 = alloca i32, align 4 + %reg14 = alloca i32, align 4 + store volatile i32 0, ptr %reg0, align 4 + store volatile i32 1, ptr %reg1, align 4 + store volatile i32 2, ptr %reg2, align 4 + store volatile i32 3, ptr %reg3, align 4 + store volatile i32 4, ptr %reg4, align 4 + store volatile i32 5, ptr %reg5, align 4 + store volatile i32 6, ptr %reg6, align 4 + store volatile i32 7, ptr %reg7, align 4 + store volatile i32 8, ptr %reg8, align 4 + store volatile i32 9, ptr %reg9, align 4 + store volatile i32 16, ptr %reg10, align 4 + store volatile i32 17, ptr %reg11, align 4 + store volatile i32 18, ptr %reg12, align 4 + store volatile i32 19, ptr %reg13, align 4 + store volatile i32 20, ptr %reg14, align 4 + %0 = load volatile i32, ptr %reg0, align 4 + %1 = load volatile i32, ptr %reg1, align 4 + %2 = load volatile i32, ptr %reg2, align 4 + %3 = load volatile i32, ptr %reg3, align 4 + %4 = load volatile i32, ptr %reg4, align 4 + %5 = load volatile i32, ptr %reg5, align 4 + %6 = load volatile i32, ptr %reg6, align 4 + %7 = load volatile i32, ptr %reg7, align 4 + %8 = load volatile i32, ptr %reg8, align 4 + %9 = load volatile i32, ptr %reg9, align 4 + %10 = load volatile i32, ptr %reg10, align 4 + %11 = load volatile i32, ptr %reg11, align 4 + %12 = load volatile i32, ptr %reg12, align 4 + %13 = load volatile i32, ptr %reg13, align 4 + %14 = load volatile i32, ptr %reg14, align 4 + %15 = call { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } asm "nop", "=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,~{dirflag},~{fpsr},~{flags}"(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, i32 %8, i32 %9, i32 %10, i32 %11, i32 %12, i32 %13, i32 %14) #1 + %asmresult = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %15, 0 + %asmresult1 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %15, 1 + %asmresult2 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %15, 2 + %asmresult3 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %15, 3 + %asmresult4 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %15, 4 + %asmresult5 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %15, 5 + %asmresult6 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %15, 6 + %asmresult7 = 
extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %15, 7 + %asmresult8 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %15, 8 + %asmresult9 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %15, 9 + %asmresult10 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %15, 10 + %asmresult11 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %15, 11 + %asmresult12 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %15, 12 + %asmresult13 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %15, 13 + %asmresult14 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %15, 14 + store volatile i32 %asmresult, ptr %reg0, align 4 + store volatile i32 %asmresult1, ptr %reg1, align 4 + store volatile i32 %asmresult2, ptr %reg2, align 4 + store volatile i32 %asmresult3, ptr %reg3, align 4 + store volatile i32 %asmresult4, ptr %reg4, align 4 + store volatile i32 %asmresult5, ptr %reg5, align 4 + store volatile i32 %asmresult6, ptr %reg6, align 4 + store volatile i32 %asmresult7, ptr %reg7, align 4 + store volatile i32 %asmresult8, ptr %reg8, align 4 + store volatile i32 %asmresult9, ptr %reg9, align 4 + store volatile i32 %asmresult10, ptr %reg10, align 4 + store volatile i32 %asmresult11, ptr %reg11, align 4 + store volatile i32 %asmresult12, ptr %reg12, align 4 + store volatile i32 %asmresult13, ptr %reg13, align 4 + store volatile i32 %asmresult14, ptr %reg14, align 4 + ret i32 0 +} + +define i32 @test_non_leaf() "frame-pointer"="non-leaf" { +; CHECK-LABEL: test_non_leaf: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: pushq %r15 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: pushq %r14 +; CHECK-NEXT: .cfi_def_cfa_offset 24 +; CHECK-NEXT: pushq %r13 +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: pushq %r12 +; CHECK-NEXT: .cfi_def_cfa_offset 40 +; CHECK-NEXT: pushq %rbx +; CHECK-NEXT: .cfi_def_cfa_offset 48 +; CHECK-NEXT: .cfi_offset %rbx, -48 +; CHECK-NEXT: .cfi_offset %r12, -40 +; CHECK-NEXT: .cfi_offset %r13, -32 +; CHECK-NEXT: .cfi_offset %r14, -24 +; CHECK-NEXT: .cfi_offset %r15, -16 +; CHECK-NEXT: movl $0, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $1, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $2, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $3, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $4, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $5, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $6, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $7, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $8, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $9, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $16, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $17, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $18, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $19, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %eax +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %ecx +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %edx +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %esi +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %edi +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %r8d +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %r9d +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %r10d +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %r11d +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %ebx +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %r14d +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %r15d +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %r12d +; CHECK-NEXT: movl 
-{{[0-9]+}}(%rsp), %r13d +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: movl %eax, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %edx, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %esi, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %edi, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %r8d, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %r9d, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %r10d, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %r11d, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %ebx, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %r14d, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %r15d, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %r12d, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %r13d, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: popq %rbx +; CHECK-NEXT: .cfi_def_cfa_offset 40 +; CHECK-NEXT: popq %r12 +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: popq %r13 +; CHECK-NEXT: .cfi_def_cfa_offset 24 +; CHECK-NEXT: popq %r14 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: popq %r15 +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: retq +entry: + %reg0 = alloca i32, align 4 + %reg1 = alloca i32, align 4 + %reg2 = alloca i32, align 4 + %reg3 = alloca i32, align 4 + %reg4 = alloca i32, align 4 + %reg5 = alloca i32, align 4 + %reg6 = alloca i32, align 4 + %reg7 = alloca i32, align 4 + %reg8 = alloca i32, align 4 + %reg9 = alloca i32, align 4 + %reg10 = alloca i32, align 4 + %reg11 = alloca i32, align 4 + %reg12 = alloca i32, align 4 + %reg13 = alloca i32, align 4 + store volatile i32 0, ptr %reg0, align 4 + store volatile i32 1, ptr %reg1, align 4 + store volatile i32 2, ptr %reg2, align 4 + store volatile i32 3, ptr %reg3, align 4 + store volatile i32 4, ptr %reg4, align 4 + store volatile i32 5, ptr %reg5, align 4 + store volatile i32 6, ptr %reg6, align 4 + store volatile i32 7, ptr %reg7, align 4 + store volatile i32 8, ptr %reg8, align 4 + store volatile i32 9, ptr %reg9, align 4 + store volatile i32 16, ptr %reg10, align 4 + store volatile i32 17, ptr %reg11, align 4 + store volatile i32 18, ptr %reg12, align 4 + store volatile i32 19, ptr %reg13, align 4 + %0 = load volatile i32, ptr %reg0, align 4 + %1 = load volatile i32, ptr %reg1, align 4 + %2 = load volatile i32, ptr %reg2, align 4 + %3 = load volatile i32, ptr %reg3, align 4 + %4 = load volatile i32, ptr %reg4, align 4 + %5 = load volatile i32, ptr %reg5, align 4 + %6 = load volatile i32, ptr %reg6, align 4 + %7 = load volatile i32, ptr %reg7, align 4 + %8 = load volatile i32, ptr %reg8, align 4 + %9 = load volatile i32, ptr %reg9, align 4 + %10 = load volatile i32, ptr %reg10, align 4 + %11 = load volatile i32, ptr %reg11, align 4 + %12 = load volatile i32, ptr %reg12, align 4 + %13 = load volatile i32, ptr %reg13, align 4 + %14 = call { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } asm "nop", "=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,0,1,2,3,4,5,6,7,8,9,10,11,12,13,~{dirflag},~{fpsr},~{flags}"(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, i32 %8, i32 %9, i32 %10, i32 %11, i32 %12, i32 %13) #1 + %asmresult = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 0 + %asmresult1 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 1 + %asmresult2 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 2 + %asmresult3 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 3 + %asmresult4 = extractvalue { i32, i32, i32, 
i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 4 + %asmresult5 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 5 + %asmresult6 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 6 + %asmresult7 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 7 + %asmresult8 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 8 + %asmresult9 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 9 + %asmresult10 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 10 + %asmresult11 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 11 + %asmresult12 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 12 + %asmresult13 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 13 + store volatile i32 %asmresult, ptr %reg0, align 4 + store volatile i32 %asmresult1, ptr %reg1, align 4 + store volatile i32 %asmresult2, ptr %reg2, align 4 + store volatile i32 %asmresult3, ptr %reg3, align 4 + store volatile i32 %asmresult4, ptr %reg4, align 4 + store volatile i32 %asmresult5, ptr %reg5, align 4 + store volatile i32 %asmresult6, ptr %reg6, align 4 + store volatile i32 %asmresult7, ptr %reg7, align 4 + store volatile i32 %asmresult8, ptr %reg8, align 4 + store volatile i32 %asmresult9, ptr %reg9, align 4 + store volatile i32 %asmresult10, ptr %reg10, align 4 + store volatile i32 %asmresult11, ptr %reg11, align 4 + store volatile i32 %asmresult12, ptr %reg12, align 4 + store volatile i32 %asmresult13, ptr %reg13, align 4 + ret i32 0 +} + +define i32 @test_reserved() "frame-pointer"="reserved" { +; CHECK-LABEL: test_reserved: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: pushq %r15 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: pushq %r14 +; CHECK-NEXT: .cfi_def_cfa_offset 24 +; CHECK-NEXT: pushq %r13 +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: pushq %r12 +; CHECK-NEXT: .cfi_def_cfa_offset 40 +; CHECK-NEXT: pushq %rbx +; CHECK-NEXT: .cfi_def_cfa_offset 48 +; CHECK-NEXT: .cfi_offset %rbx, -48 +; CHECK-NEXT: .cfi_offset %r12, -40 +; CHECK-NEXT: .cfi_offset %r13, -32 +; CHECK-NEXT: .cfi_offset %r14, -24 +; CHECK-NEXT: .cfi_offset %r15, -16 +; CHECK-NEXT: movl $0, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $1, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $2, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $3, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $4, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $5, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $6, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $7, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $8, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $9, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $16, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $17, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $18, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $19, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %eax +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %ecx +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %edx +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %esi +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %edi +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %r8d +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %r9d +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %r10d +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %r11d +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %ebx +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %r14d +; CHECK-NEXT: movl 
-{{[0-9]+}}(%rsp), %r15d +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %r12d +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %r13d +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: movl %eax, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %edx, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %esi, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %edi, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %r8d, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %r9d, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %r10d, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %r11d, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %ebx, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %r14d, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %r15d, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %r12d, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %r13d, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: popq %rbx +; CHECK-NEXT: .cfi_def_cfa_offset 40 +; CHECK-NEXT: popq %r12 +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: popq %r13 +; CHECK-NEXT: .cfi_def_cfa_offset 24 +; CHECK-NEXT: popq %r14 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: popq %r15 +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: retq +entry: + %reg0 = alloca i32, align 4 + %reg1 = alloca i32, align 4 + %reg2 = alloca i32, align 4 + %reg3 = alloca i32, align 4 + %reg4 = alloca i32, align 4 + %reg5 = alloca i32, align 4 + %reg6 = alloca i32, align 4 + %reg7 = alloca i32, align 4 + %reg8 = alloca i32, align 4 + %reg9 = alloca i32, align 4 + %reg10 = alloca i32, align 4 + %reg11 = alloca i32, align 4 + %reg12 = alloca i32, align 4 + %reg13 = alloca i32, align 4 + store volatile i32 0, ptr %reg0, align 4 + store volatile i32 1, ptr %reg1, align 4 + store volatile i32 2, ptr %reg2, align 4 + store volatile i32 3, ptr %reg3, align 4 + store volatile i32 4, ptr %reg4, align 4 + store volatile i32 5, ptr %reg5, align 4 + store volatile i32 6, ptr %reg6, align 4 + store volatile i32 7, ptr %reg7, align 4 + store volatile i32 8, ptr %reg8, align 4 + store volatile i32 9, ptr %reg9, align 4 + store volatile i32 16, ptr %reg10, align 4 + store volatile i32 17, ptr %reg11, align 4 + store volatile i32 18, ptr %reg12, align 4 + store volatile i32 19, ptr %reg13, align 4 + %0 = load volatile i32, ptr %reg0, align 4 + %1 = load volatile i32, ptr %reg1, align 4 + %2 = load volatile i32, ptr %reg2, align 4 + %3 = load volatile i32, ptr %reg3, align 4 + %4 = load volatile i32, ptr %reg4, align 4 + %5 = load volatile i32, ptr %reg5, align 4 + %6 = load volatile i32, ptr %reg6, align 4 + %7 = load volatile i32, ptr %reg7, align 4 + %8 = load volatile i32, ptr %reg8, align 4 + %9 = load volatile i32, ptr %reg9, align 4 + %10 = load volatile i32, ptr %reg10, align 4 + %11 = load volatile i32, ptr %reg11, align 4 + %12 = load volatile i32, ptr %reg12, align 4 + %13 = load volatile i32, ptr %reg13, align 4 + %14 = call { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } asm "nop", "=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,0,1,2,3,4,5,6,7,8,9,10,11,12,13,~{dirflag},~{fpsr},~{flags}"(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, i32 %8, i32 %9, i32 %10, i32 %11, i32 %12, i32 %13) #1 + %asmresult = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 0 + %asmresult1 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 1 + %asmresult2 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 2 + %asmresult3 = extractvalue { i32, i32, i32, i32, i32, i32, 
i32, i32, i32, i32, i32, i32, i32, i32 } %14, 3 + %asmresult4 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 4 + %asmresult5 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 5 + %asmresult6 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 6 + %asmresult7 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 7 + %asmresult8 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 8 + %asmresult9 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 9 + %asmresult10 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 10 + %asmresult11 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 11 + %asmresult12 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 12 + %asmresult13 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 13 + store volatile i32 %asmresult, ptr %reg0, align 4 + store volatile i32 %asmresult1, ptr %reg1, align 4 + store volatile i32 %asmresult2, ptr %reg2, align 4 + store volatile i32 %asmresult3, ptr %reg3, align 4 + store volatile i32 %asmresult4, ptr %reg4, align 4 + store volatile i32 %asmresult5, ptr %reg5, align 4 + store volatile i32 %asmresult6, ptr %reg6, align 4 + store volatile i32 %asmresult7, ptr %reg7, align 4 + store volatile i32 %asmresult8, ptr %reg8, align 4 + store volatile i32 %asmresult9, ptr %reg9, align 4 + store volatile i32 %asmresult10, ptr %reg10, align 4 + store volatile i32 %asmresult11, ptr %reg11, align 4 + store volatile i32 %asmresult12, ptr %reg12, align 4 + store volatile i32 %asmresult13, ptr %reg13, align 4 + ret i32 0 +} + +define i32 @test_all() "frame-pointer"="all" { +; CHECK-LABEL: test_all: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: pushq %rbp +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset %rbp, -16 +; CHECK-NEXT: movq %rsp, %rbp +; CHECK-NEXT: .cfi_def_cfa_register %rbp +; CHECK-NEXT: pushq %r15 +; CHECK-NEXT: pushq %r14 +; CHECK-NEXT: pushq %r13 +; CHECK-NEXT: pushq %r12 +; CHECK-NEXT: pushq %rbx +; CHECK-NEXT: .cfi_offset %rbx, -56 +; CHECK-NEXT: .cfi_offset %r12, -48 +; CHECK-NEXT: .cfi_offset %r13, -40 +; CHECK-NEXT: .cfi_offset %r14, -32 +; CHECK-NEXT: .cfi_offset %r15, -24 +; CHECK-NEXT: movl $0, -96(%rbp) +; CHECK-NEXT: movl $1, -92(%rbp) +; CHECK-NEXT: movl $2, -88(%rbp) +; CHECK-NEXT: movl $3, -84(%rbp) +; CHECK-NEXT: movl $4, -80(%rbp) +; CHECK-NEXT: movl $5, -76(%rbp) +; CHECK-NEXT: movl $6, -72(%rbp) +; CHECK-NEXT: movl $7, -68(%rbp) +; CHECK-NEXT: movl $8, -64(%rbp) +; CHECK-NEXT: movl $9, -60(%rbp) +; CHECK-NEXT: movl $16, -56(%rbp) +; CHECK-NEXT: movl $17, -52(%rbp) +; CHECK-NEXT: movl $18, -48(%rbp) +; CHECK-NEXT: movl $19, -44(%rbp) +; CHECK-NEXT: movl -96(%rbp), %eax +; CHECK-NEXT: movl -92(%rbp), %ecx +; CHECK-NEXT: movl -88(%rbp), %edx +; CHECK-NEXT: movl -84(%rbp), %esi +; CHECK-NEXT: movl -80(%rbp), %edi +; CHECK-NEXT: movl -76(%rbp), %r8d +; CHECK-NEXT: movl -72(%rbp), %r9d +; CHECK-NEXT: movl -68(%rbp), %r10d +; CHECK-NEXT: movl -64(%rbp), %r11d +; CHECK-NEXT: movl -60(%rbp), %ebx +; CHECK-NEXT: movl -56(%rbp), %r14d +; CHECK-NEXT: movl -52(%rbp), %r15d +; CHECK-NEXT: movl -48(%rbp), %r12d +; CHECK-NEXT: movl -44(%rbp), %r13d +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP 
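+; (With "frame-pointer"="all", locals are addressed relative to %rbp and the
+; allocator never receives %rbp: only 14 values are kept live across the
+; inline asm here, versus 15 in check_none above, where %ebp is allocatable.)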
+; CHECK-NEXT: movl %eax, -96(%rbp) +; CHECK-NEXT: movl %ecx, -92(%rbp) +; CHECK-NEXT: movl %edx, -88(%rbp) +; CHECK-NEXT: movl %esi, -84(%rbp) +; CHECK-NEXT: movl %edi, -80(%rbp) +; CHECK-NEXT: movl %r8d, -76(%rbp) +; CHECK-NEXT: movl %r9d, -72(%rbp) +; CHECK-NEXT: movl %r10d, -68(%rbp) +; CHECK-NEXT: movl %r11d, -64(%rbp) +; CHECK-NEXT: movl %ebx, -60(%rbp) +; CHECK-NEXT: movl %r14d, -56(%rbp) +; CHECK-NEXT: movl %r15d, -52(%rbp) +; CHECK-NEXT: movl %r12d, -48(%rbp) +; CHECK-NEXT: movl %r13d, -44(%rbp) +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: popq %rbx +; CHECK-NEXT: popq %r12 +; CHECK-NEXT: popq %r13 +; CHECK-NEXT: popq %r14 +; CHECK-NEXT: popq %r15 +; CHECK-NEXT: popq %rbp +; CHECK-NEXT: .cfi_def_cfa %rsp, 8 +; CHECK-NEXT: retq +entry: + %reg0 = alloca i32, align 4 + %reg1 = alloca i32, align 4 + %reg2 = alloca i32, align 4 + %reg3 = alloca i32, align 4 + %reg4 = alloca i32, align 4 + %reg5 = alloca i32, align 4 + %reg6 = alloca i32, align 4 + %reg7 = alloca i32, align 4 + %reg8 = alloca i32, align 4 + %reg9 = alloca i32, align 4 + %reg10 = alloca i32, align 4 + %reg11 = alloca i32, align 4 + %reg12 = alloca i32, align 4 + %reg13 = alloca i32, align 4 + store volatile i32 0, ptr %reg0, align 4 + store volatile i32 1, ptr %reg1, align 4 + store volatile i32 2, ptr %reg2, align 4 + store volatile i32 3, ptr %reg3, align 4 + store volatile i32 4, ptr %reg4, align 4 + store volatile i32 5, ptr %reg5, align 4 + store volatile i32 6, ptr %reg6, align 4 + store volatile i32 7, ptr %reg7, align 4 + store volatile i32 8, ptr %reg8, align 4 + store volatile i32 9, ptr %reg9, align 4 + store volatile i32 16, ptr %reg10, align 4 + store volatile i32 17, ptr %reg11, align 4 + store volatile i32 18, ptr %reg12, align 4 + store volatile i32 19, ptr %reg13, align 4 + %0 = load volatile i32, ptr %reg0, align 4 + %1 = load volatile i32, ptr %reg1, align 4 + %2 = load volatile i32, ptr %reg2, align 4 + %3 = load volatile i32, ptr %reg3, align 4 + %4 = load volatile i32, ptr %reg4, align 4 + %5 = load volatile i32, ptr %reg5, align 4 + %6 = load volatile i32, ptr %reg6, align 4 + %7 = load volatile i32, ptr %reg7, align 4 + %8 = load volatile i32, ptr %reg8, align 4 + %9 = load volatile i32, ptr %reg9, align 4 + %10 = load volatile i32, ptr %reg10, align 4 + %11 = load volatile i32, ptr %reg11, align 4 + %12 = load volatile i32, ptr %reg12, align 4 + %13 = load volatile i32, ptr %reg13, align 4 + %14 = call { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } asm "nop", "=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,0,1,2,3,4,5,6,7,8,9,10,11,12,13,~{dirflag},~{fpsr},~{flags}"(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, i32 %8, i32 %9, i32 %10, i32 %11, i32 %12, i32 %13) #1 + %asmresult = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 0 + %asmresult1 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 1 + %asmresult2 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 2 + %asmresult3 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 3 + %asmresult4 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 4 + %asmresult5 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 5 + %asmresult6 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 6 + %asmresult7 = extractvalue { i32, i32, i32, 
i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 7 + %asmresult8 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 8 + %asmresult9 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 9 + %asmresult10 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 10 + %asmresult11 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 11 + %asmresult12 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 12 + %asmresult13 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 13 + store volatile i32 %asmresult, ptr %reg0, align 4 + store volatile i32 %asmresult1, ptr %reg1, align 4 + store volatile i32 %asmresult2, ptr %reg2, align 4 + store volatile i32 %asmresult3, ptr %reg3, align 4 + store volatile i32 %asmresult4, ptr %reg4, align 4 + store volatile i32 %asmresult5, ptr %reg5, align 4 + store volatile i32 %asmresult6, ptr %reg6, align 4 + store volatile i32 %asmresult7, ptr %reg7, align 4 + store volatile i32 %asmresult8, ptr %reg8, align 4 + store volatile i32 %asmresult9, ptr %reg9, align 4 + store volatile i32 %asmresult10, ptr %reg10, align 4 + store volatile i32 %asmresult11, ptr %reg11, align 4 + store volatile i32 %asmresult12, ptr %reg12, align 4 + store volatile i32 %asmresult13, ptr %reg13, align 4 + ret i32 0 +} diff --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMEnums.td b/mlir/include/mlir/Dialect/LLVMIR/LLVMEnums.td index e7b44fd94b26d..e2edab44153ca 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/LLVMEnums.td +++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMEnums.td @@ -758,13 +758,16 @@ def FramePointerKindAll : LLVM_EnumAttrCase<"All", "all", "All", 2>; def FramePointerKindReserved : LLVM_EnumAttrCase<"Reserved", "reserved", "Reserved", 3>; +def FramePointerKindNonLeafNoReserve + : LLVM_EnumAttrCase<"NonLeafNoReserve", "non-leaf-no-reserve", "NonLeafNoReserve", 4>; def FramePointerKindEnum : LLVM_EnumAttr< "FramePointerKind", "::llvm::FramePointerKind", "LLVM FramePointerKind", [FramePointerKindNone, FramePointerKindNonLeaf, - FramePointerKindAll, FramePointerKindReserved]> { + FramePointerKindAll, FramePointerKindReserved, + FramePointerKindNonLeafNoReserve]> { let cppNamespace = "::mlir::LLVM::framePointerKind"; } From e67ac07881c215c91fe1ec714be6f3582178073c Mon Sep 17 00:00:00 2001 From: Maksim Levental Date: Tue, 11 Nov 2025 12:29:09 -0500 Subject: [PATCH 04/64] Revert "Reapply "Reapply "[mlir] Add FP software implementation lowering pass: `arith-to-apfloat` (#166618)" (#167431)"" (#167549) Reverts llvm/llvm-project#167436 to fix sanitizers --- .../ArithToAPFloat/ArithToAPFloat.h | 21 --- mlir/include/mlir/Conversion/Passes.h | 1 - mlir/include/mlir/Conversion/Passes.td | 15 -- mlir/include/mlir/Dialect/Func/Utils/Utils.h | 7 - .../mlir/Dialect/LLVMIR/FunctionCallUtils.h | 4 - .../ArithToAPFloat/ArithToAPFloat.cpp | 158 ------------------ .../Conversion/ArithToAPFloat/CMakeLists.txt | 18 -- .../Conversion/ArithToLLVM/ArithToLLVM.cpp | 1 - mlir/lib/Conversion/CMakeLists.txt | 1 - .../VectorToLLVM/ConvertVectorToLLVM.cpp | 14 -- mlir/lib/Dialect/Func/Utils/Utils.cpp | 25 --- .../Dialect/LLVMIR/IR/FunctionCallUtils.cpp | 11 -- mlir/lib/ExecutionEngine/APFloatWrappers.cpp | 89 ---------- mlir/lib/ExecutionEngine/CMakeLists.txt | 12 -- .../ArithToApfloat/arith-to-apfloat.mlir | 128 -------------- .../Arith/CPU/test-apfloat-emulation.mlir | 
36 ---- mlir/test/lit.cfg.py | 1 - 17 files changed, 542 deletions(-) delete mode 100644 mlir/include/mlir/Conversion/ArithToAPFloat/ArithToAPFloat.h delete mode 100644 mlir/lib/Conversion/ArithToAPFloat/ArithToAPFloat.cpp delete mode 100644 mlir/lib/Conversion/ArithToAPFloat/CMakeLists.txt delete mode 100644 mlir/lib/ExecutionEngine/APFloatWrappers.cpp delete mode 100644 mlir/test/Conversion/ArithToApfloat/arith-to-apfloat.mlir delete mode 100644 mlir/test/Integration/Dialect/Arith/CPU/test-apfloat-emulation.mlir diff --git a/mlir/include/mlir/Conversion/ArithToAPFloat/ArithToAPFloat.h b/mlir/include/mlir/Conversion/ArithToAPFloat/ArithToAPFloat.h deleted file mode 100644 index 64a42a228199e..0000000000000 --- a/mlir/include/mlir/Conversion/ArithToAPFloat/ArithToAPFloat.h +++ /dev/null @@ -1,21 +0,0 @@ -//===- ArithToAPFloat.h - Arith to APFloat impl conversion ---*- C++ ----*-===// -// -// Part of the APFloat Project, under the Apache License v2.0 with APFloat -// Exceptions. See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH APFloat-exception -// -//===----------------------------------------------------------------------===// - -#ifndef MLIR_CONVERSION_ARITHTOAPFLOAT_ARITHTOAPFLOAT_H -#define MLIR_CONVERSION_ARITHTOAPFLOAT_ARITHTOAPFLOAT_H - -#include - -namespace mlir { -class Pass; - -#define GEN_PASS_DECL_ARITHTOAPFLOATCONVERSIONPASS -#include "mlir/Conversion/Passes.h.inc" -} // namespace mlir - -#endif // MLIR_CONVERSION_ARITHTOAPFLOAT_ARITHTOAPFLOAT_H diff --git a/mlir/include/mlir/Conversion/Passes.h b/mlir/include/mlir/Conversion/Passes.h index 82bdfd02661a6..40d866ec7bf10 100644 --- a/mlir/include/mlir/Conversion/Passes.h +++ b/mlir/include/mlir/Conversion/Passes.h @@ -12,7 +12,6 @@ #include "mlir/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.h" #include "mlir/Conversion/AffineToStandard/AffineToStandard.h" #include "mlir/Conversion/ArithToAMDGPU/ArithToAMDGPU.h" -#include "mlir/Conversion/ArithToAPFloat/ArithToAPFloat.h" #include "mlir/Conversion/ArithToArmSME/ArithToArmSME.h" #include "mlir/Conversion/ArithToEmitC/ArithToEmitCPass.h" #include "mlir/Conversion/ArithToLLVM/ArithToLLVM.h" diff --git a/mlir/include/mlir/Conversion/Passes.td b/mlir/include/mlir/Conversion/Passes.td index 79bc380dbcb7a..70e3e45c225db 100644 --- a/mlir/include/mlir/Conversion/Passes.td +++ b/mlir/include/mlir/Conversion/Passes.td @@ -186,21 +186,6 @@ def ArithToLLVMConversionPass : Pass<"convert-arith-to-llvm"> { ]; } -//===----------------------------------------------------------------------===// -// ArithToAPFloat -//===----------------------------------------------------------------------===// - -def ArithToAPFloatConversionPass - : Pass<"convert-arith-to-apfloat", "ModuleOp"> { - let summary = "Convert Arith ops to APFloat runtime library calls"; - let description = [{ - This pass converts supported Arith ops to APFloat-based runtime library - calls (APFloatWrappers.cpp). APFloat is a software implementation of - floating-point arithmetic operations. 
- }]; - let dependentDialects = ["func::FuncDialect"]; -} - //===----------------------------------------------------------------------===// // ArithToSPIRV //===----------------------------------------------------------------------===// diff --git a/mlir/include/mlir/Dialect/Func/Utils/Utils.h b/mlir/include/mlir/Dialect/Func/Utils/Utils.h index 00d50874a2e8d..3576126a487ac 100644 --- a/mlir/include/mlir/Dialect/Func/Utils/Utils.h +++ b/mlir/include/mlir/Dialect/Func/Utils/Utils.h @@ -60,13 +60,6 @@ mlir::FailureOr> deduplicateArgsOfFuncOp(mlir::RewriterBase &rewriter, mlir::func::FuncOp funcOp, mlir::ModuleOp moduleOp); -/// Look up a FuncOp with signature `resultTypes`(`paramTypes`)` and name -/// `name`. Return a failure if the FuncOp is found but with a different -/// signature. -FailureOr lookupFnDecl(SymbolOpInterface symTable, StringRef name, - FunctionType funcT, - SymbolTableCollection *symbolTables = nullptr); - } // namespace func } // namespace mlir diff --git a/mlir/include/mlir/Dialect/LLVMIR/FunctionCallUtils.h b/mlir/include/mlir/Dialect/LLVMIR/FunctionCallUtils.h index b09d32022e348..8ad9ed18acebd 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/FunctionCallUtils.h +++ b/mlir/include/mlir/Dialect/LLVMIR/FunctionCallUtils.h @@ -52,10 +52,6 @@ lookupOrCreatePrintF32Fn(OpBuilder &b, Operation *moduleOp, FailureOr lookupOrCreatePrintF64Fn(OpBuilder &b, Operation *moduleOp, SymbolTableCollection *symbolTables = nullptr); -FailureOr -lookupOrCreateApFloatPrintFn(OpBuilder &b, Operation *moduleOp, - SymbolTableCollection *symbolTables = nullptr); - /// Declares a function to print a C-string. /// If a custom runtime function is defined via `runtimeFunctionName`, it must /// have the signature void(char const*). The default function is `printString`. diff --git a/mlir/lib/Conversion/ArithToAPFloat/ArithToAPFloat.cpp b/mlir/lib/Conversion/ArithToAPFloat/ArithToAPFloat.cpp deleted file mode 100644 index 01fd5b278aca4..0000000000000 --- a/mlir/lib/Conversion/ArithToAPFloat/ArithToAPFloat.cpp +++ /dev/null @@ -1,158 +0,0 @@ -//===- ArithToAPFloat.cpp - Arithmetic to APFloat Conversion --------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "mlir/Conversion/ArithToAPFloat/ArithToAPFloat.h" - -#include "mlir/Dialect/Arith/IR/Arith.h" -#include "mlir/Dialect/Arith/Transforms/Passes.h" -#include "mlir/Dialect/Func/IR/FuncOps.h" -#include "mlir/Dialect/Func/Utils/Utils.h" -#include "mlir/IR/PatternMatch.h" -#include "mlir/IR/Verifier.h" -#include "mlir/Transforms/WalkPatternRewriteDriver.h" - -namespace mlir { -#define GEN_PASS_DEF_ARITHTOAPFLOATCONVERSIONPASS -#include "mlir/Conversion/Passes.h.inc" -} // namespace mlir - -using namespace mlir; -using namespace mlir::func; - -static FuncOp createFnDecl(OpBuilder &b, SymbolOpInterface symTable, - StringRef name, FunctionType funcT, bool setPrivate, - SymbolTableCollection *symbolTables = nullptr) { - OpBuilder::InsertionGuard g(b); - assert(!symTable->getRegion(0).empty() && "expected non-empty region"); - b.setInsertionPointToStart(&symTable->getRegion(0).front()); - FuncOp funcOp = FuncOp::create(b, symTable->getLoc(), name, funcT); - if (setPrivate) - funcOp.setPrivate(); - if (symbolTables) { - SymbolTable &symbolTable = symbolTables->getSymbolTable(symTable); - symbolTable.insert(funcOp, symTable->getRegion(0).front().begin()); - } - return funcOp; -} - -/// Helper function to look up or create the symbol for a runtime library -/// function for a binary arithmetic operation. -/// -/// Parameter 1: APFloat semantics -/// Parameter 2: Left-hand side operand -/// Parameter 3: Right-hand side operand -/// -/// This function will return a failure if the function is found but has an -/// unexpected signature. -/// -static FailureOr -lookupOrCreateBinaryFn(OpBuilder &b, SymbolOpInterface symTable, StringRef name, - SymbolTableCollection *symbolTables = nullptr) { - auto i32Type = IntegerType::get(symTable->getContext(), 32); - auto i64Type = IntegerType::get(symTable->getContext(), 64); - - std::string funcName = (llvm::Twine("_mlir_apfloat_") + name).str(); - FunctionType funcT = - FunctionType::get(b.getContext(), {i32Type, i64Type, i64Type}, {i64Type}); - FailureOr func = - lookupFnDecl(symTable, funcName, funcT, symbolTables); - // Failed due to type mismatch. - if (failed(func)) - return func; - // Successfully matched existing decl. - if (*func) - return *func; - - return createFnDecl(b, symTable, funcName, funcT, - /*setPrivate=*/true, symbolTables); -} - -/// Rewrite a binary arithmetic operation to an APFloat function call. -template -struct BinaryArithOpToAPFloatConversion final : OpRewritePattern { - BinaryArithOpToAPFloatConversion(MLIRContext *context, PatternBenefit benefit, - SymbolOpInterface symTable) - : OpRewritePattern(context, benefit), symTable(symTable) {}; - - LogicalResult matchAndRewrite(OpTy op, - PatternRewriter &rewriter) const override { - // Get APFloat function from runtime library. - FailureOr fn = - lookupOrCreateBinaryFn(rewriter, symTable, APFloatName); - if (failed(fn)) - return fn; - - rewriter.setInsertionPoint(op); - // Cast operands to 64-bit integers. 
- Location loc = op.getLoc(); - auto floatTy = cast(op.getType()); - auto intWType = rewriter.getIntegerType(floatTy.getWidth()); - auto int64Type = rewriter.getI64Type(); - Value lhsBits = arith::ExtUIOp::create( - rewriter, loc, int64Type, - arith::BitcastOp::create(rewriter, loc, intWType, op.getLhs())); - Value rhsBits = arith::ExtUIOp::create( - rewriter, loc, int64Type, - arith::BitcastOp::create(rewriter, loc, intWType, op.getRhs())); - - // Call APFloat function. - int32_t sem = - llvm::APFloatBase::SemanticsToEnum(floatTy.getFloatSemantics()); - Value semValue = arith::ConstantOp::create( - rewriter, loc, rewriter.getI32Type(), - rewriter.getIntegerAttr(rewriter.getI32Type(), sem)); - SmallVector params = {semValue, lhsBits, rhsBits}; - auto resultOp = - func::CallOp::create(rewriter, loc, TypeRange(rewriter.getI64Type()), - SymbolRefAttr::get(*fn), params); - - // Truncate result to the original width. - Value truncatedBits = arith::TruncIOp::create(rewriter, loc, intWType, - resultOp->getResult(0)); - rewriter.replaceOp( - op, arith::BitcastOp::create(rewriter, loc, floatTy, truncatedBits)); - return success(); - } - - SymbolOpInterface symTable; -}; - -namespace { -struct ArithToAPFloatConversionPass final - : impl::ArithToAPFloatConversionPassBase { - using Base::Base; - - void runOnOperation() override { - MLIRContext *context = &getContext(); - RewritePatternSet patterns(context); - static const char add[] = "add"; - static const char subtract[] = "subtract"; - static const char multiply[] = "multiply"; - static const char divide[] = "divide"; - static const char remainder[] = "remainder"; - patterns.add, - BinaryArithOpToAPFloatConversion, - BinaryArithOpToAPFloatConversion, - BinaryArithOpToAPFloatConversion, - BinaryArithOpToAPFloatConversion>( - context, 1, getOperation()); - LogicalResult result = success(); - ScopedDiagnosticHandler scopedHandler(context, [&result](Diagnostic &diag) { - if (diag.getSeverity() == DiagnosticSeverity::Error) { - result = failure(); - } - // NB: if you don't return failure, no other diag handlers will fire (see - // mlir/lib/IR/Diagnostics.cpp:DiagnosticEngineImpl::emit). 
- return failure(); - }); - walkAndApplyPatterns(getOperation(), std::move(patterns)); - if (failed(result)) - return signalPassFailure(); - } -}; -} // namespace diff --git a/mlir/lib/Conversion/ArithToAPFloat/CMakeLists.txt b/mlir/lib/Conversion/ArithToAPFloat/CMakeLists.txt deleted file mode 100644 index b5ec49c087163..0000000000000 --- a/mlir/lib/Conversion/ArithToAPFloat/CMakeLists.txt +++ /dev/null @@ -1,18 +0,0 @@ -add_mlir_conversion_library(MLIRArithToAPFloat - ArithToAPFloat.cpp - - ADDITIONAL_HEADER_DIRS - ${MLIR_MAIN_INCLUDE_DIR}/mlir/Conversion/ArithToLLVM - - DEPENDS - MLIRConversionPassIncGen - - LINK_COMPONENTS - Core - - LINK_LIBS PUBLIC - MLIRArithDialect - MLIRArithTransforms - MLIRFuncDialect - MLIRFuncUtils - ) diff --git a/mlir/lib/Conversion/ArithToLLVM/ArithToLLVM.cpp b/mlir/lib/Conversion/ArithToLLVM/ArithToLLVM.cpp index f2bacc3399144..b6099902cc337 100644 --- a/mlir/lib/Conversion/ArithToLLVM/ArithToLLVM.cpp +++ b/mlir/lib/Conversion/ArithToLLVM/ArithToLLVM.cpp @@ -14,7 +14,6 @@ #include "mlir/Conversion/LLVMCommon/VectorPattern.h" #include "mlir/Dialect/Arith/IR/Arith.h" #include "mlir/Dialect/Arith/Transforms/Passes.h" -#include "mlir/Dialect/LLVMIR/FunctionCallUtils.h" #include "mlir/Dialect/LLVMIR/LLVMAttrs.h" #include "mlir/Dialect/LLVMIR/LLVMDialect.h" #include "mlir/IR/TypeUtilities.h" diff --git a/mlir/lib/Conversion/CMakeLists.txt b/mlir/lib/Conversion/CMakeLists.txt index 613dc6d242ceb..bebf1b8fff3f9 100644 --- a/mlir/lib/Conversion/CMakeLists.txt +++ b/mlir/lib/Conversion/CMakeLists.txt @@ -2,7 +2,6 @@ add_subdirectory(AffineToStandard) add_subdirectory(AMDGPUToROCDL) add_subdirectory(ArithCommon) add_subdirectory(ArithToAMDGPU) -add_subdirectory(ArithToAPFloat) add_subdirectory(ArithToArmSME) add_subdirectory(ArithToEmitC) add_subdirectory(ArithToLLVM) diff --git a/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp b/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp index c747e1b59558a..69a317ecd101f 100644 --- a/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp +++ b/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp @@ -1654,20 +1654,6 @@ class VectorPrintOpConversion : public ConvertOpToLLVMPattern { return failure(); } } - } else if (auto floatTy = dyn_cast(printType)) { - // Print other floating-point types using the APFloat runtime library. 
- int32_t sem = - llvm::APFloatBase::SemanticsToEnum(floatTy.getFloatSemantics()); - Value semValue = LLVM::ConstantOp::create( - rewriter, loc, rewriter.getI32Type(), - rewriter.getIntegerAttr(rewriter.getI32Type(), sem)); - Value floatBits = - LLVM::ZExtOp::create(rewriter, loc, rewriter.getI64Type(), value); - printer = - LLVM::lookupOrCreateApFloatPrintFn(rewriter, parent, symbolTables); - emitCall(rewriter, loc, printer.value(), - ValueRange({semValue, floatBits})); - return success(); } else { return failure(); } diff --git a/mlir/lib/Dialect/Func/Utils/Utils.cpp b/mlir/lib/Dialect/Func/Utils/Utils.cpp index d6dfd0229963c..b4cb0932ef631 100644 --- a/mlir/lib/Dialect/Func/Utils/Utils.cpp +++ b/mlir/lib/Dialect/Func/Utils/Utils.cpp @@ -254,28 +254,3 @@ func::deduplicateArgsOfFuncOp(RewriterBase &rewriter, func::FuncOp funcOp, return std::make_pair(*newFuncOpOrFailure, newCallOp); } - -FailureOr -func::lookupFnDecl(SymbolOpInterface symTable, StringRef name, - FunctionType funcT, SymbolTableCollection *symbolTables) { - FuncOp func; - if (symbolTables) { - func = symbolTables->lookupSymbolIn( - symTable, StringAttr::get(symTable->getContext(), name)); - } else { - func = llvm::dyn_cast_or_null( - SymbolTable::lookupSymbolIn(symTable, name)); - } - - if (!func) - return func; - - mlir::FunctionType foundFuncT = func.getFunctionType(); - // Assert the signature of the found function is same as expected - if (funcT != foundFuncT) { - return func.emitError("matched function '") - << name << "' but with different type: " << foundFuncT - << " (expected " << funcT << ")"; - } - return func; -} diff --git a/mlir/lib/Dialect/LLVMIR/IR/FunctionCallUtils.cpp b/mlir/lib/Dialect/LLVMIR/IR/FunctionCallUtils.cpp index 160b6ae89215c..feaffa34897b6 100644 --- a/mlir/lib/Dialect/LLVMIR/IR/FunctionCallUtils.cpp +++ b/mlir/lib/Dialect/LLVMIR/IR/FunctionCallUtils.cpp @@ -30,7 +30,6 @@ static constexpr llvm::StringRef kPrintF16 = "printF16"; static constexpr llvm::StringRef kPrintBF16 = "printBF16"; static constexpr llvm::StringRef kPrintF32 = "printF32"; static constexpr llvm::StringRef kPrintF64 = "printF64"; -static constexpr llvm::StringRef kPrintApFloat = "printApFloat"; static constexpr llvm::StringRef kPrintString = "printString"; static constexpr llvm::StringRef kPrintOpen = "printOpen"; static constexpr llvm::StringRef kPrintClose = "printClose"; @@ -161,16 +160,6 @@ mlir::LLVM::lookupOrCreatePrintF64Fn(OpBuilder &b, Operation *moduleOp, LLVM::LLVMVoidType::get(moduleOp->getContext()), symbolTables); } -FailureOr -mlir::LLVM::lookupOrCreateApFloatPrintFn(OpBuilder &b, Operation *moduleOp, - SymbolTableCollection *symbolTables) { - return lookupOrCreateReservedFn( - b, moduleOp, kPrintApFloat, - {IntegerType::get(moduleOp->getContext(), 32), - IntegerType::get(moduleOp->getContext(), 64)}, - LLVM::LLVMVoidType::get(moduleOp->getContext()), symbolTables); -} - static LLVM::LLVMPointerType getCharPtr(MLIRContext *context) { return LLVM::LLVMPointerType::get(context); } diff --git a/mlir/lib/ExecutionEngine/APFloatWrappers.cpp b/mlir/lib/ExecutionEngine/APFloatWrappers.cpp deleted file mode 100644 index 0a05f7369e556..0000000000000 --- a/mlir/lib/ExecutionEngine/APFloatWrappers.cpp +++ /dev/null @@ -1,89 +0,0 @@ -//===- APFloatWrappers.cpp - Software Implementation of FP Arithmetics --- ===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file exposes the APFloat infrastructure to MLIR programs as a runtime -// library. APFloat is a software implementation of floating point arithmetics. -// -// On the MLIR side, floating-point values must be bitcasted to 64-bit integers -// before calling a runtime function. If a floating-point type has less than -// 64 bits, it must be zero-extended to 64 bits after bitcasting it to an -// integer. -// -// Runtime functions receive the floating-point operands of the arithmeic -// operation in the form of 64-bit integers, along with the APFloat semantics -// in the form of a 32-bit integer, which will be interpreted as an -// APFloatBase::Semantics enum value. -// -#include "llvm/ADT/APFloat.h" - -#ifdef _WIN32 -#ifndef MLIR_APFLOAT_WRAPPERS_EXPORT -#ifdef mlir_apfloat_wrappers_EXPORTS -// We are building this library -#define MLIR_APFLOAT_WRAPPERS_EXPORT __declspec(dllexport) -#else -// We are using this library -#define MLIR_APFLOAT_WRAPPERS_EXPORT __declspec(dllimport) -#endif // mlir_apfloat_wrappers_EXPORTS -#endif // MLIR_APFLOAT_WRAPPERS_EXPORT -#else -// Non-windows: use visibility attributes. -#define MLIR_APFLOAT_WRAPPERS_EXPORT __attribute__((visibility("default"))) -#endif // _WIN32 - -/// Binary operations without rounding mode. -#define APFLOAT_BINARY_OP(OP) \ - MLIR_APFLOAT_WRAPPERS_EXPORT int64_t _mlir_apfloat_##OP( \ - int32_t semantics, uint64_t a, uint64_t b) { \ - const llvm::fltSemantics &sem = llvm::APFloatBase::EnumToSemantics( \ - static_cast(semantics)); \ - unsigned bitWidth = llvm::APFloatBase::semanticsSizeInBits(sem); \ - llvm::APFloat lhs(sem, llvm::APInt(bitWidth, a)); \ - llvm::APFloat rhs(sem, llvm::APInt(bitWidth, b)); \ - lhs.OP(rhs); \ - return lhs.bitcastToAPInt().getZExtValue(); \ - } - -/// Binary operations with rounding mode. 
-#define APFLOAT_BINARY_OP_ROUNDING_MODE(OP, ROUNDING_MODE) \ - MLIR_APFLOAT_WRAPPERS_EXPORT int64_t _mlir_apfloat_##OP( \ - int32_t semantics, uint64_t a, uint64_t b) { \ - const llvm::fltSemantics &sem = llvm::APFloatBase::EnumToSemantics( \ - static_cast(semantics)); \ - unsigned bitWidth = llvm::APFloatBase::semanticsSizeInBits(sem); \ - llvm::APFloat lhs(sem, llvm::APInt(bitWidth, a)); \ - llvm::APFloat rhs(sem, llvm::APInt(bitWidth, b)); \ - lhs.OP(rhs, ROUNDING_MODE); \ - return lhs.bitcastToAPInt().getZExtValue(); \ - } - -extern "C" { - -#define BIN_OPS_WITH_ROUNDING(X) \ - X(add, llvm::RoundingMode::NearestTiesToEven) \ - X(subtract, llvm::RoundingMode::NearestTiesToEven) \ - X(multiply, llvm::RoundingMode::NearestTiesToEven) \ - X(divide, llvm::RoundingMode::NearestTiesToEven) - -BIN_OPS_WITH_ROUNDING(APFLOAT_BINARY_OP_ROUNDING_MODE) -#undef BIN_OPS_WITH_ROUNDING -#undef APFLOAT_BINARY_OP_ROUNDING_MODE - -APFLOAT_BINARY_OP(remainder) - -#undef APFLOAT_BINARY_OP - -MLIR_APFLOAT_WRAPPERS_EXPORT void printApFloat(int32_t semantics, uint64_t a) { - const llvm::fltSemantics &sem = llvm::APFloatBase::EnumToSemantics( - static_cast(semantics)); - unsigned bitWidth = llvm::APFloatBase::semanticsSizeInBits(sem); - llvm::APFloat x(sem, llvm::APInt(bitWidth, a)); - double d = x.convertToDouble(); - fprintf(stdout, "%lg", d); -} -} diff --git a/mlir/lib/ExecutionEngine/CMakeLists.txt b/mlir/lib/ExecutionEngine/CMakeLists.txt index 8c09e50e4de7b..fdeb4dacf9278 100644 --- a/mlir/lib/ExecutionEngine/CMakeLists.txt +++ b/mlir/lib/ExecutionEngine/CMakeLists.txt @@ -2,7 +2,6 @@ # is a big dependency which most don't need. set(LLVM_OPTIONAL_SOURCES - APFloatWrappers.cpp ArmRunnerUtils.cpp ArmSMEStubs.cpp AsyncRuntime.cpp @@ -168,15 +167,6 @@ if(LLVM_ENABLE_PIC) set_property(TARGET mlir_float16_utils PROPERTY CXX_STANDARD 17) target_compile_definitions(mlir_float16_utils PRIVATE mlir_float16_utils_EXPORTS) - add_mlir_library(mlir_apfloat_wrappers - SHARED - APFloatWrappers.cpp - - EXCLUDE_FROM_LIBMLIR - ) - set_property(TARGET mlir_apfloat_wrappers PROPERTY CXX_STANDARD 17) - target_compile_definitions(mlir_apfloat_wrappers PRIVATE mlir_apfloat_wrappers_EXPORTS) - add_subdirectory(SparseTensor) add_mlir_library(mlir_c_runner_utils @@ -187,7 +177,6 @@ if(LLVM_ENABLE_PIC) EXCLUDE_FROM_LIBMLIR LINK_LIBS PUBLIC - mlir_apfloat_wrappers mlir_float16_utils MLIRSparseTensorEnums MLIRSparseTensorRuntime @@ -202,7 +191,6 @@ if(LLVM_ENABLE_PIC) EXCLUDE_FROM_LIBMLIR LINK_LIBS PUBLIC - mlir_apfloat_wrappers mlir_float16_utils ) target_compile_definitions(mlir_runner_utils PRIVATE mlir_runner_utils_EXPORTS) diff --git a/mlir/test/Conversion/ArithToApfloat/arith-to-apfloat.mlir b/mlir/test/Conversion/ArithToApfloat/arith-to-apfloat.mlir deleted file mode 100644 index 797f42c37a26f..0000000000000 --- a/mlir/test/Conversion/ArithToApfloat/arith-to-apfloat.mlir +++ /dev/null @@ -1,128 +0,0 @@ -// RUN: mlir-opt %s --convert-arith-to-apfloat -split-input-file -verify-diagnostics | FileCheck %s - -// CHECK-LABEL: func.func private @_mlir_apfloat_add(i32, i64, i64) -> i64 - -// CHECK-LABEL: func.func @foo() -> f8E4M3FN { -// CHECK: %[[CONSTANT_0:.*]] = arith.constant 2.250000e+00 : f8E4M3FN -// CHECK: return %[[CONSTANT_0]] : f8E4M3FN -// CHECK: } - -// CHECK-LABEL: func.func @bar() -> f6E3M2FN { -// CHECK: %[[CONSTANT_0:.*]] = arith.constant 3.000000e+00 : f6E3M2FN -// CHECK: return %[[CONSTANT_0]] : f6E3M2FN -// CHECK: } - -// Illustrate that both f8E4M3FN and f6E3M2FN calling the same _mlir_apfloat_add is fine -// 
because each gets its own semantics enum and gets bitcast/extui/trunci to its own width. -// CHECK-LABEL: func.func @full_example() { -// CHECK: %[[CONSTANT_0:.*]] = arith.constant 1.375000e+00 : f8E4M3FN -// CHECK: %[[VAL_0:.*]] = call @foo() : () -> f8E4M3FN -// CHECK: %[[BITCAST_0:.*]] = arith.bitcast %[[CONSTANT_0]] : f8E4M3FN to i8 -// CHECK: %[[EXTUI_0:.*]] = arith.extui %[[BITCAST_0]] : i8 to i64 -// CHECK: %[[BITCAST_1:.*]] = arith.bitcast %[[VAL_0]] : f8E4M3FN to i8 -// CHECK: %[[EXTUI_1:.*]] = arith.extui %[[BITCAST_1]] : i8 to i64 -// // fltSemantics semantics for f8E4M3FN -// CHECK: %[[CONSTANT_1:.*]] = arith.constant 10 : i32 -// CHECK: %[[VAL_1:.*]] = call @_mlir_apfloat_add(%[[CONSTANT_1]], %[[EXTUI_0]], %[[EXTUI_1]]) : (i32, i64, i64) -> i64 -// CHECK: %[[TRUNCI_0:.*]] = arith.trunci %[[VAL_1]] : i64 to i8 -// CHECK: %[[BITCAST_2:.*]] = arith.bitcast %[[TRUNCI_0]] : i8 to f8E4M3FN -// CHECK: vector.print %[[BITCAST_2]] : f8E4M3FN - -// CHECK: %[[CONSTANT_2:.*]] = arith.constant 2.500000e+00 : f6E3M2FN -// CHECK: %[[VAL_2:.*]] = call @bar() : () -> f6E3M2FN -// CHECK: %[[BITCAST_3:.*]] = arith.bitcast %[[CONSTANT_2]] : f6E3M2FN to i6 -// CHECK: %[[EXTUI_2:.*]] = arith.extui %[[BITCAST_3]] : i6 to i64 -// CHECK: %[[BITCAST_4:.*]] = arith.bitcast %[[VAL_2]] : f6E3M2FN to i6 -// CHECK: %[[EXTUI_3:.*]] = arith.extui %[[BITCAST_4]] : i6 to i64 -// // fltSemantics semantics for f6E3M2FN -// CHECK: %[[CONSTANT_3:.*]] = arith.constant 16 : i32 -// CHECK: %[[VAL_3:.*]] = call @_mlir_apfloat_add(%[[CONSTANT_3]], %[[EXTUI_2]], %[[EXTUI_3]]) : (i32, i64, i64) -> i64 -// CHECK: %[[TRUNCI_1:.*]] = arith.trunci %[[VAL_3]] : i64 to i6 -// CHECK: %[[BITCAST_5:.*]] = arith.bitcast %[[TRUNCI_1]] : i6 to f6E3M2FN -// CHECK: vector.print %[[BITCAST_5]] : f6E3M2FN -// CHECK: return -// CHECK: } - -// Put rhs into separate function so that it won't be constant-folded. 
-func.func @foo() -> f8E4M3FN { - %cst = arith.constant 2.2 : f8E4M3FN - return %cst : f8E4M3FN -} - -func.func @bar() -> f6E3M2FN { - %cst = arith.constant 3.2 : f6E3M2FN - return %cst : f6E3M2FN -} - -func.func @full_example() { - %a = arith.constant 1.4 : f8E4M3FN - %b = func.call @foo() : () -> (f8E4M3FN) - %c = arith.addf %a, %b : f8E4M3FN - vector.print %c : f8E4M3FN - - %d = arith.constant 2.4 : f6E3M2FN - %e = func.call @bar() : () -> (f6E3M2FN) - %f = arith.addf %d, %e : f6E3M2FN - vector.print %f : f6E3M2FN - return -} - -// ----- - -// CHECK: func.func private @_mlir_apfloat_add(i32, i64, i64) -> i64 -// CHECK: %[[sem:.*]] = arith.constant 18 : i32 -// CHECK: call @_mlir_apfloat_add(%[[sem]], %{{.*}}, %{{.*}}) : (i32, i64, i64) -> i64 -func.func @addf(%arg0: f4E2M1FN, %arg1: f4E2M1FN) { - %0 = arith.addf %arg0, %arg1 : f4E2M1FN - return -} - -// ----- - -// Test decl collision (different type) -// expected-error@+1{{matched function '_mlir_apfloat_add' but with different type: '(i32, i32, f32) -> index' (expected '(i32, i64, i64) -> i64')}} -func.func private @_mlir_apfloat_add(i32, i32, f32) -> index -func.func @addf(%arg0: f4E2M1FN, %arg1: f4E2M1FN) { - %0 = arith.addf %arg0, %arg1 : f4E2M1FN - return -} - -// ----- - -// CHECK: func.func private @_mlir_apfloat_subtract(i32, i64, i64) -> i64 -// CHECK: %[[sem:.*]] = arith.constant 18 : i32 -// CHECK: call @_mlir_apfloat_subtract(%[[sem]], %{{.*}}, %{{.*}}) : (i32, i64, i64) -> i64 -func.func @subf(%arg0: f4E2M1FN, %arg1: f4E2M1FN) { - %0 = arith.subf %arg0, %arg1 : f4E2M1FN - return -} - -// ----- - -// CHECK: func.func private @_mlir_apfloat_multiply(i32, i64, i64) -> i64 -// CHECK: %[[sem:.*]] = arith.constant 18 : i32 -// CHECK: call @_mlir_apfloat_multiply(%[[sem]], %{{.*}}, %{{.*}}) : (i32, i64, i64) -> i64 -func.func @subf(%arg0: f4E2M1FN, %arg1: f4E2M1FN) { - %0 = arith.mulf %arg0, %arg1 : f4E2M1FN - return -} - -// ----- - -// CHECK: func.func private @_mlir_apfloat_divide(i32, i64, i64) -> i64 -// CHECK: %[[sem:.*]] = arith.constant 18 : i32 -// CHECK: call @_mlir_apfloat_divide(%[[sem]], %{{.*}}, %{{.*}}) : (i32, i64, i64) -> i64 -func.func @subf(%arg0: f4E2M1FN, %arg1: f4E2M1FN) { - %0 = arith.divf %arg0, %arg1 : f4E2M1FN - return -} - -// ----- - -// CHECK: func.func private @_mlir_apfloat_remainder(i32, i64, i64) -> i64 -// CHECK: %[[sem:.*]] = arith.constant 18 : i32 -// CHECK: call @_mlir_apfloat_remainder(%[[sem]], %{{.*}}, %{{.*}}) : (i32, i64, i64) -> i64 -func.func @remf(%arg0: f4E2M1FN, %arg1: f4E2M1FN) { - %0 = arith.remf %arg0, %arg1 : f4E2M1FN - return -} diff --git a/mlir/test/Integration/Dialect/Arith/CPU/test-apfloat-emulation.mlir b/mlir/test/Integration/Dialect/Arith/CPU/test-apfloat-emulation.mlir deleted file mode 100644 index 2768afe0834b5..0000000000000 --- a/mlir/test/Integration/Dialect/Arith/CPU/test-apfloat-emulation.mlir +++ /dev/null @@ -1,36 +0,0 @@ -// Case 1: All floating-point arithmetics is lowered through APFloat. -// RUN: mlir-opt %s --convert-arith-to-apfloat --convert-to-llvm | \ -// RUN: mlir-runner -e entry --entry-point-result=void \ -// RUN: --shared-libs=%mlir_c_runner_utils \ -// RUN: --shared-libs=%mlir_apfloat_wrappers | FileCheck %s - -// Case 2: Only unsupported arithmetics (f8E4M3FN) is lowered through APFloat. -// Arithmetics on f32 is lowered directly to LLVM. 
-// RUN: mlir-opt %s --convert-to-llvm --convert-arith-to-apfloat \ -// RUN: --convert-to-llvm --reconcile-unrealized-casts | \ -// RUN: mlir-runner -e entry --entry-point-result=void \ -// RUN: --shared-libs=%mlir_c_runner_utils \ -// RUN: --shared-libs=%mlir_apfloat_wrappers | FileCheck %s - -// Put rhs into separate function so that it won't be constant-folded. -func.func @foo() -> (f8E4M3FN, f32) { - %cst1 = arith.constant 2.2 : f8E4M3FN - %cst2 = arith.constant 2.2 : f32 - return %cst1, %cst2 : f8E4M3FN, f32 -} - -func.func @entry() { - %a1 = arith.constant 1.4 : f8E4M3FN - %a2 = arith.constant 1.4 : f32 - %b1, %b2 = func.call @foo() : () -> (f8E4M3FN, f32) - %c1 = arith.addf %a1, %b1 : f8E4M3FN // not supported by LLVM - %c2 = arith.addf %a2, %b2 : f32 // supported by LLVM - - // CHECK: 3.5 - vector.print %c1 : f8E4M3FN - - // CHECK: 3.6 - vector.print %c2 : f32 - - return -} diff --git a/mlir/test/lit.cfg.py b/mlir/test/lit.cfg.py index 4a38ed605be0c..6ff12d66523f5 100644 --- a/mlir/test/lit.cfg.py +++ b/mlir/test/lit.cfg.py @@ -208,7 +208,6 @@ def find_real_python_interpreter(): add_runtime("mlir_c_runner_utils"), add_runtime("mlir_async_runtime"), add_runtime("mlir_float16_utils"), - add_runtime("mlir_apfloat_wrappers"), "mlir-linalg-ods-yaml-gen", "mlir-reduce", "mlir-pdll", From 5eb8d290dc48e88cdc028f9afd1a83f4605a95b3 Mon Sep 17 00:00:00 2001 From: vangthao95 Date: Tue, 11 Nov 2025 09:31:59 -0800 Subject: [PATCH 05/64] [AMDGPU][GlobalISel] Add RegBankLegalize support for G_BLOCK_ADDR and G_GLOBAL_VALUE (#165340) --- .../AMDGPU/AMDGPURegBankLegalizeHelper.cpp | 9 ++ .../AMDGPU/AMDGPURegBankLegalizeRules.cpp | 13 +++ .../AMDGPU/AMDGPURegBankLegalizeRules.h | 4 + .../GlobalISel/global-value-addrspaces.ll | 104 ++++++++++++++++++ .../CodeGen/AMDGPU/GlobalISel/lds-relocs.ll | 2 +- .../AMDGPU/GlobalISel/lds-zero-initializer.ll | 8 +- .../GlobalISel/regbankselect-block-addr.mir | 2 +- 7 files changed, 136 insertions(+), 6 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/global-value-addrspaces.ll diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp index dc8fa7f0eef49..1765d054a3c0d 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp @@ -873,6 +873,7 @@ LLT RegBankLegalizeHelper::getTyFromID(RegBankLLTMappingApplyID ID) { case Sgpr128: case Vgpr128: return LLT::scalar(128); + case SgprP0: case VgprP0: return LLT::pointer(0, 64); case SgprP1: @@ -887,6 +888,8 @@ LLT RegBankLegalizeHelper::getTyFromID(RegBankLLTMappingApplyID ID) { case SgprP5: case VgprP5: return LLT::pointer(5, 32); + case SgprP8: + return LLT::pointer(8, 128); case SgprV2S16: case VgprV2S16: case UniInVgprV2S16: @@ -972,10 +975,12 @@ RegBankLegalizeHelper::getRegBankFromID(RegBankLLTMappingApplyID ID) { case Sgpr32_WF: case Sgpr64: case Sgpr128: + case SgprP0: case SgprP1: case SgprP3: case SgprP4: case SgprP5: + case SgprP8: case SgprPtr32: case SgprPtr64: case SgprPtr128: @@ -1055,10 +1060,12 @@ void RegBankLegalizeHelper::applyMappingDst( case Sgpr32: case Sgpr64: case Sgpr128: + case SgprP0: case SgprP1: case SgprP3: case SgprP4: case SgprP5: + case SgprP8: case SgprV2S16: case SgprV2S32: case SgprV4S32: @@ -1198,10 +1205,12 @@ void RegBankLegalizeHelper::applyMappingSrc( case Sgpr32: case Sgpr64: case Sgpr128: + case SgprP0: case SgprP1: case SgprP3: case SgprP4: case SgprP5: + case SgprP8: case SgprV2S16: case SgprV2S32: case SgprV4S32: { diff 
--git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp index 1e5885a25c195..615b911a22903 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp @@ -66,6 +66,8 @@ bool matchUniformityAndLLT(Register Reg, UniformityLLTOpPredicateID UniID, return MRI.getType(Reg) == LLT::pointer(4, 64); case P5: return MRI.getType(Reg) == LLT::pointer(5, 32); + case P8: + return MRI.getType(Reg) == LLT::pointer(8, 128); case Ptr32: return isAnyPtr(MRI.getType(Reg), 32); case Ptr64: @@ -108,6 +110,8 @@ bool matchUniformityAndLLT(Register Reg, UniformityLLTOpPredicateID UniID, return MRI.getType(Reg) == LLT::pointer(4, 64) && MUI.isUniform(Reg); case UniP5: return MRI.getType(Reg) == LLT::pointer(5, 32) && MUI.isUniform(Reg); + case UniP8: + return MRI.getType(Reg) == LLT::pointer(8, 128) && MUI.isUniform(Reg); case UniPtr32: return isAnyPtr(MRI.getType(Reg), 32) && MUI.isUniform(Reg); case UniPtr64: @@ -918,6 +922,15 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST, addRulesForGOpcs({G_READSTEADYCOUNTER, G_READCYCLECOUNTER}, Standard) .Uni(S64, {{Sgpr64}, {}}); + addRulesForGOpcs({G_BLOCK_ADDR}).Any({{UniP0}, {{SgprP0}, {}}}); + + addRulesForGOpcs({G_GLOBAL_VALUE}) + .Any({{UniP0}, {{SgprP0}, {}}}) + .Any({{UniP1}, {{SgprP1}, {}}}) + .Any({{UniP3}, {{SgprP3}, {}}}) + .Any({{UniP4}, {{SgprP4}, {}}}) + .Any({{UniP8}, {{SgprP8}, {}}}); + bool hasSALUFloat = ST->hasSALUFloatInsts(); addRulesForGOpcs({G_FADD}, Standard) diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h index e6df5d87a2edc..7e4ce7b43dc3b 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h @@ -63,6 +63,7 @@ enum UniformityLLTOpPredicateID { P3, P4, P5, + P8, Ptr32, Ptr64, Ptr128, @@ -72,6 +73,7 @@ enum UniformityLLTOpPredicateID { UniP3, UniP4, UniP5, + UniP8, UniPtr32, UniPtr64, UniPtr128, @@ -136,10 +138,12 @@ enum RegBankLLTMappingApplyID { Sgpr32, Sgpr64, Sgpr128, + SgprP0, SgprP1, SgprP3, SgprP4, SgprP5, + SgprP8, SgprPtr32, SgprPtr64, SgprPtr128, diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/global-value-addrspaces.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/global-value-addrspaces.ll new file mode 100644 index 0000000000000..cf9524b860fd2 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/global-value-addrspaces.ll @@ -0,0 +1,104 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefix=GCN %s + +@flat = external global i32, align 4 +@global = external addrspace(1) global i32, align 4 +@lds = addrspace(3) global i32 poison, align 4 +@constant = external addrspace(4) constant i32, align 4 +@buf = external addrspace(8) global i8 + +define ptr @global_value_as0_external() { +; GCN-LABEL: global_value_as0_external: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_getpc_b64 s[4:5] +; GCN-NEXT: s_add_u32 s4, s4, flat@gotpcrel32@lo+4 +; GCN-NEXT: s_addc_u32 s5, s5, flat@gotpcrel32@hi+12 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mov_b32_e32 v1, s5 +; GCN-NEXT: s_setpc_b64 s[30:31] + ret ptr @flat +} + +define ptr addrspace(1) @global_value_as1_external() { +; GCN-LABEL: 
global_value_as1_external: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_getpc_b64 s[4:5] +; GCN-NEXT: s_add_u32 s4, s4, global@gotpcrel32@lo+4 +; GCN-NEXT: s_addc_u32 s5, s5, global@gotpcrel32@hi+12 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mov_b32_e32 v1, s5 +; GCN-NEXT: s_setpc_b64 s[30:31] + ret ptr addrspace(1) @global +} + +define ptr addrspace(4) @global_value_as4_external() { +; GCN-LABEL: global_value_as4_external: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_getpc_b64 s[4:5] +; GCN-NEXT: s_add_u32 s4, s4, constant@gotpcrel32@lo+4 +; GCN-NEXT: s_addc_u32 s5, s5, constant@gotpcrel32@hi+12 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mov_b32_e32 v1, s5 +; GCN-NEXT: s_setpc_b64 s[30:31] + ret ptr addrspace(4) @constant +} + +define amdgpu_kernel void @global_value_as3_lds_kernel(ptr addrspace(1) %out) { +; GCN-LABEL: global_value_as3_lds_kernel: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: global_store_dword v0, v0, s[0:1] +; GCN-NEXT: s_endpgm + %addr = ptrtoint ptr addrspace(3) @lds to i32 + store i32 %addr, ptr addrspace(1) %out + ret void +} + +define void @global_value_as8_buffer_store(i32 %val) { +; GCN-LABEL: global_value_as8_buffer_store: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_getpc_b64 s[8:9] +; GCN-NEXT: s_add_u32 s8, s8, buf@gotpcrel32@lo+4 +; GCN-NEXT: s_addc_u32 s9, s9, buf@gotpcrel32@hi+12 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] + call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 %val, ptr addrspace(8) @buf, i32 0, i32 0, i32 0) + ret void +} + +define i32 @global_value_as8_buffer_load(i32 %offset) { +; GCN-LABEL: global_value_as8_buffer_load: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_getpc_b64 s[8:9] +; GCN-NEXT: s_add_u32 s8, s8, buf@gotpcrel32@lo+4 +; GCN-NEXT: s_addc_u32 s9, s9, buf@gotpcrel32@hi+12 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] + %val = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) @buf, i32 %offset, i32 0, i32 0) + ret i32 %val +} + +declare void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32, ptr addrspace(8) nocapture writeonly, i32, i32, i32 immarg) #0 +declare i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) nocapture readonly, i32, i32, i32 immarg) #1 + +attributes #0 = { nocallback nofree nosync nounwind willreturn memory(argmem: write) } +attributes #1 = { nocallback nofree nosync nounwind willreturn memory(argmem: read) } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-relocs.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-relocs.ll index 82886ab9e7d55..e1ac8ba5e6db4 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-relocs.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-relocs.ll @@ -1,4 +1,4 @@ -; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -amdgpu-enable-lower-module-lds=0 -show-mc-encoding < %s | FileCheck -check-prefixes=GCN %s +; RUN: llc 
-global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -amdgpu-enable-lower-module-lds=0 -show-mc-encoding < %s | FileCheck -check-prefixes=GCN %s ; FIXME: Merge with DAG test @lds.external = external unnamed_addr addrspace(3) global [0 x i32] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-zero-initializer.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-zero-initializer.ll index cabb37c330b4a..3396eaedf359e 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-zero-initializer.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-zero-initializer.ll @@ -1,8 +1,8 @@ -; RUN: llc -mtriple=amdgcn -mcpu=tahiti -global-isel -stop-after=instruction-select -o - %s | FileCheck -check-prefixes=GCN,GFX8 %s -; RUN: llc -mtriple=amdgcn -mcpu=tonga -global-isel -stop-after=instruction-select -o - %s | FileCheck -check-prefixes=GCN,GFX9 %s +; RUN: llc -mtriple=amdgcn -mcpu=tahiti -global-isel -new-reg-bank-select -stop-after=instruction-select -o - %s | FileCheck -check-prefixes=GCN,GFX8 %s +; RUN: llc -mtriple=amdgcn -mcpu=tonga -global-isel -new-reg-bank-select -stop-after=instruction-select -o - %s | FileCheck -check-prefixes=GCN,GFX9 %s -; RUN: not llc -mtriple=amdgcn -mcpu=tahiti -global-isel < %s 2>&1 | FileCheck %s -; RUN: not llc -mtriple=amdgcn -mcpu=tonga -global-isel < %s 2>&1 | FileCheck %s +; RUN: not llc -mtriple=amdgcn -mcpu=tahiti -global-isel -new-reg-bank-select < %s 2>&1 | FileCheck %s +; RUN: not llc -mtriple=amdgcn -mcpu=tonga -global-isel -new-reg-bank-select < %s 2>&1 | FileCheck %s ; CHECK: error: lds: unsupported initializer for address space diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-block-addr.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-block-addr.mir index a50c7fe0748b8..fc86dd884fac0 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-block-addr.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-block-addr.mir @@ -1,5 +1,5 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -O0 -march amdgcn -mcpu=fiji -run-pass=regbankselect %s -o - | FileCheck %s +# RUN: llc -O0 -march amdgcn -mcpu=fiji -run-pass="amdgpu-regbankselect,amdgpu-regbanklegalize" %s -o - | FileCheck %s --- | From 385aa012128f632be06da5b813e0f6ed8b7c05cc Mon Sep 17 00:00:00 2001 From: Riyaz Ahmad <69200125+riyaz86a@users.noreply.github.com> Date: Tue, 11 Nov 2025 23:02:29 +0530 Subject: [PATCH 06/64] [Asan] Ensure minimum stack size 128KB in ThreadedStressStackReuseTest (#165198) The Asan test `ThreadedStressStackReuseTest` fails on AIX due to its smaller default thread stack size. Set the thread stack size to a minimum of 128KB to ensure reliable test behavior on platforms with smaller default thread stack sizes. --------- Co-authored-by: Riyaz Ahmad --- compiler-rt/lib/asan/tests/asan_test.cpp | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/compiler-rt/lib/asan/tests/asan_test.cpp b/compiler-rt/lib/asan/tests/asan_test.cpp index 2d23a12cc6ae2..59d64ac4753ca 100644 --- a/compiler-rt/lib/asan/tests/asan_test.cpp +++ b/compiler-rt/lib/asan/tests/asan_test.cpp @@ -1115,15 +1115,28 @@ TEST(AddressSanitizer, StressStackReuseTest) { LotsOfStackReuse(); } +// On some platform (ex: AIX), the default thread stack size (~96 KB) is +// insufficient for this test and can lead to stack overflows.
+#define MIN_STACK_SIZE (128 * 1024) // 128 KB TEST(AddressSanitizer, ThreadedStressStackReuseTest) { const int kNumThreads = 20; pthread_t t[kNumThreads]; + size_t curStackSize = 0; + pthread_attr_t attr; + pthread_attr_init(&attr); + // Get the current (default) thread stack size + pthread_attr_getstacksize(&attr, &curStackSize); + if (curStackSize < MIN_STACK_SIZE) { + int rc = pthread_attr_setstacksize(&attr, MIN_STACK_SIZE); + ASSERT_EQ(0, rc); + } for (int i = 0; i < kNumThreads; i++) { - PTHREAD_CREATE(&t[i], 0, (void* (*)(void *x))LotsOfStackReuse, 0); + PTHREAD_CREATE(&t[i], &attr, (void* (*)(void* x))LotsOfStackReuse, 0); } for (int i = 0; i < kNumThreads; i++) { PTHREAD_JOIN(t[i], 0); } + pthread_attr_destroy(&attr); } // pthread_exit tries to perform unwinding stuff that leads to dlopen'ing From 2690f05522c6de0440403bc8ca0fcaf647e17c8a Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Tue, 11 Nov 2025 09:37:09 -0800 Subject: [PATCH 07/64] [SystemZ] Use MCRegister instead of unsigned. NFC (#167539) --- .../SystemZ/MCTargetDesc/SystemZInstPrinterCommon.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZInstPrinterCommon.cpp b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZInstPrinterCommon.cpp index 275165d2acb07..a24543b699ab4 100644 --- a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZInstPrinterCommon.cpp +++ b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZInstPrinterCommon.cpp @@ -218,7 +218,7 @@ void SystemZInstPrinterCommon::printBDXAddrOperand(const MCInst *MI, int OpNum, void SystemZInstPrinterCommon::printBDLAddrOperand(const MCInst *MI, int OpNum, raw_ostream &O) { - unsigned Base = MI->getOperand(OpNum).getReg(); + MCRegister Base = MI->getOperand(OpNum).getReg(); const MCOperand &DispMO = MI->getOperand(OpNum + 1); uint64_t Length = MI->getOperand(OpNum + 2).getImm(); printOperand(DispMO, &MAI, O); @@ -232,9 +232,9 @@ void SystemZInstPrinterCommon::printBDLAddrOperand(const MCInst *MI, int OpNum, void SystemZInstPrinterCommon::printBDRAddrOperand(const MCInst *MI, int OpNum, raw_ostream &O) { - unsigned Base = MI->getOperand(OpNum).getReg(); + MCRegister Base = MI->getOperand(OpNum).getReg(); const MCOperand &DispMO = MI->getOperand(OpNum + 1); - unsigned Length = MI->getOperand(OpNum + 2).getReg(); + MCRegister Length = MI->getOperand(OpNum + 2).getReg(); printOperand(DispMO, &MAI, O); O << "("; printRegName(O, Length); From 04df359ae1f0209500c4dd4af1167b22e1b75d5f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andrzej=20Warzy=C5=84ski?= Date: Tue, 11 Nov 2025 17:39:56 +0000 Subject: [PATCH 08/64] [mlir][vector] Simplify createReadOrMaskedRead (#163736) Simplify `createReadOrMaskedRead` to require only _one_ argument specifying the vector type to read (passed as a `VectorType`) instead of passing the vector sizes and scalable flags independently (i.e. _two_ arguments). A simple overload is provided for users who wouldn't reuse the corresponding `VectorType` (and hence have no reason to create one). While there are no users upstream for this overload, it has proven helpful downstream.
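For reviewers, a minimal before/after sketch of a typical call-site migration (a sketch only; the names `rewriter`, `loc`, `src`, `padValue` and `readType` are illustrative, not identifiers from this patch):

  // Before (shape and scalable dims passed as two separate arguments):
  //   vector::createReadOrMaskedRead(
  //       rewriter, loc, src, readType.getShape(), padValue,
  //       /*useInBoundsInsteadOfMasking=*/false, readType.getScalableDims());
  // After (the VectorType argument carries both):
  Value read = vector::createReadOrMaskedRead(
      rewriter, loc, src, readType, padValue,
      /*useInBoundsInsteadOfMasking=*/false);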
--- .../mlir/Dialect/Vector/Utils/VectorUtils.h | 10 +++-- .../Linalg/Transforms/Vectorization.cpp | 26 +++++------ mlir/lib/Dialect/Vector/Utils/VectorUtils.cpp | 43 +++++++++++++------ 3 files changed, 50 insertions(+), 29 deletions(-) diff --git a/mlir/include/mlir/Dialect/Vector/Utils/VectorUtils.h b/mlir/include/mlir/Dialect/Vector/Utils/VectorUtils.h index a57aadcdcc5b0..45626aa280946 100644 --- a/mlir/include/mlir/Dialect/Vector/Utils/VectorUtils.h +++ b/mlir/include/mlir/Dialect/Vector/Utils/VectorUtils.h @@ -219,13 +219,17 @@ bool isLinearizableVector(VectorType type); /// Creates a TransferReadOp from `source`. /// -/// The shape of the vector to read is specified via `inputVectorSizes`. If the -/// shape of the output vector differs from the shape of the value being read, -/// masking is used to avoid out-of-bounds accesses. Set +/// If the shape of vector to read differs from the shape of the value being +/// read, masking is used to avoid out-of-bounds accesses. Set /// `useInBoundsInsteadOfMasking` to `true` to use the "in_bounds" attribute /// instead of explicit masks. /// /// Note: all read offsets are set to 0. +Value createReadOrMaskedRead(OpBuilder &builder, Location loc, Value source, + const VectorType &vecToReadTy, + std::optional padValue = std::nullopt, + bool useInBoundsInsteadOfMasking = false); + Value createReadOrMaskedRead(OpBuilder &builder, Location loc, Value source, ArrayRef inputVectorSizes, std::optional padValue = std::nullopt, diff --git a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp index 19d2d854a3838..dcf84c46949f3 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp @@ -1890,9 +1890,8 @@ vectorizeAsTensorPackOp(RewriterBase &rewriter, linalg::PackOp packOp, // Create masked TransferReadOp. auto maskedRead = vector::createReadOrMaskedRead( - rewriter, loc, packOp.getSource(), readVecType.getShape(), padValue, - useInBoundsInsteadOfMasking, - /*inputScalableVecSizes=*/{}); + rewriter, loc, packOp.getSource(), readVecType, padValue, + useInBoundsInsteadOfMasking); // Create ShapeCastOp. 
auto shapeCastOp = vector::ShapeCastOp::create( @@ -1977,9 +1976,12 @@ vectorizeAsTensorUnpackOp(RewriterBase &rewriter, linalg::UnPackOp unpackOp, } // -- Generate the read operation -- + VectorType readVecType = + VectorType::get(readVectorSizes, unpackTensorType.getElementType(), + readScalableVectorFlags); Value readResult = vector::createReadOrMaskedRead( - rewriter, loc, unpackOp.getSource(), readVectorSizes, std::nullopt, - useInBoundsInsteadOfMasking, readScalableVectorFlags); + rewriter, loc, unpackOp.getSource(), readVecType, std::nullopt, + useInBoundsInsteadOfMasking); // -- Generate the transpose operation -- PackingMetadata packMetadata; @@ -2025,9 +2027,10 @@ vectorizeAsTensorPadOp(RewriterBase &rewriter, tensor::PadOp padOp, .reifyResultShapes(rewriter, reifiedReturnShapes); (void)status; // prevent unused variable warning on non-assert builds assert(succeeded(status) && "failed to reify result shapes"); + auto readType = VectorType::get(inputVectorSizes, padValue.getType()); auto maskedRead = vector::createReadOrMaskedRead( - rewriter, loc, padOp.getSource(), inputVectorSizes, padValue, - /*useInBoundsInsteadOfMasking=*/false, /*inputScalableVecSizes=*/{}); + rewriter, loc, padOp.getSource(), readType, padValue, + /*useInBoundsInsteadOfMasking=*/false); // Create Xfer write Op Value dest = tensor::EmptyOp::create(rewriter, loc, reifiedReturnShapes[0], @@ -2222,9 +2225,9 @@ vectorizeAsLinalgContraction(RewriterBase &rewriter, VectorizationState &state, state.getCanonicalVecType(elemType, readMap.compose(indexingMap)); Value read = mlir::vector::createReadOrMaskedRead( - rewriter, loc, opOperand.get(), readType.getShape(), + rewriter, loc, opOperand.get(), readType, /*padding=*/arith::getZeroConstant(rewriter, loc, elemType), - /*useInBoundsInsteadOfMasking=*/false, readType.getScalableDims()); + /*useInBoundsInsteadOfMasking=*/false); vecOperands.push_back(read); } @@ -3165,9 +3168,8 @@ vectorizeAsInsertSliceOp(RewriterBase &rewriter, tensor::InsertSliceOp sliceOp, SmallVector readIndices( vecType.getRank(), arith::ConstantIndexOp::create(rewriter, loc, 0)); Value read = mlir::vector::createReadOrMaskedRead( - rewriter, loc, source, vecType.getShape(), padValue, - /*useInBoundsInsteadOfMasking=*/inputVectorSizes.empty(), - /*inputScalableVecSizes=*/{}); + rewriter, loc, source, vecType, padValue, + /*useInBoundsInsteadOfMasking=*/inputVectorSizes.empty()); // Create write auto writeIndices = diff --git a/mlir/lib/Dialect/Vector/Utils/VectorUtils.cpp b/mlir/lib/Dialect/Vector/Utils/VectorUtils.cpp index c809c50206793..c307fb441e3ad 100644 --- a/mlir/lib/Dialect/Vector/Utils/VectorUtils.cpp +++ b/mlir/lib/Dialect/Vector/Utils/VectorUtils.cpp @@ -322,46 +322,61 @@ Value vector::createReadOrMaskedRead(OpBuilder &builder, Location loc, std::optional padValue, bool useInBoundsInsteadOfMasking, ArrayRef inputScalableVecDims) { - assert(!llvm::is_contained(inputVectorSizes, ShapedType::kDynamic) && + VectorType vecToReadTy = VectorType::get( + inputVectorSizes, cast(source.getType()).getElementType(), + inputScalableVecDims); + + return createReadOrMaskedRead(builder, loc, source, vecToReadTy, padValue, + useInBoundsInsteadOfMasking); +} + +Value vector::createReadOrMaskedRead(OpBuilder &builder, Location loc, + Value source, + const VectorType &vecToReadTy, + std::optional padValue, + bool useInBoundsInsteadOfMasking) { + assert(!llvm::is_contained(vecToReadTy.getScalableDims(), + ShapedType::kDynamic) && "invalid input vector sizes"); auto sourceShapedType = 
cast(source.getType()); auto sourceShape = sourceShapedType.getShape(); - assert(sourceShape.size() == inputVectorSizes.size() && + + int64_t vecToReadRank = vecToReadTy.getRank(); + auto vecToReadShape = vecToReadTy.getShape(); + + assert(sourceShape.size() == static_cast(vecToReadRank) && "expected same ranks."); - auto vectorType = - VectorType::get(inputVectorSizes, sourceShapedType.getElementType(), - inputScalableVecDims); assert((!padValue.has_value() || padValue.value().getType() == sourceShapedType.getElementType()) && "expected same pad element type to match source element type"); - int64_t readRank = inputVectorSizes.size(); + auto zero = arith::ConstantIndexOp::create(builder, loc, 0); - SmallVector inBoundsVal(readRank, true); + SmallVector inBoundsVal(vecToReadRank, true); if (useInBoundsInsteadOfMasking) { // Update the inBounds attribute. // FIXME: This computation is too weak - it ignores the read indices. - for (unsigned i = 0; i < readRank; i++) - inBoundsVal[i] = (sourceShape[i] == inputVectorSizes[i]) && + for (unsigned i = 0; i < vecToReadRank; i++) + inBoundsVal[i] = (sourceShape[i] == vecToReadShape[i]) && ShapedType::isStatic(sourceShape[i]); } auto transferReadOp = vector::TransferReadOp::create( builder, loc, - /*vectorType=*/vectorType, + /*vectorType=*/vecToReadTy, /*source=*/source, - /*indices=*/SmallVector(readRank, zero), + /*indices=*/SmallVector(vecToReadRank, zero), /*padding=*/padValue, /*inBounds=*/inBoundsVal); - if (llvm::equal(inputVectorSizes, sourceShape) || useInBoundsInsteadOfMasking) + if (llvm::equal(vecToReadTy.getShape(), sourceShape) || + useInBoundsInsteadOfMasking) return transferReadOp; SmallVector mixedSourceDims = isa(source.getType()) ? memref::getMixedSizes(builder, loc, source) : tensor::getMixedSizes(builder, loc, source); - auto maskType = VectorType::get(inputVectorSizes, builder.getI1Type(), - inputScalableVecDims); + auto maskType = vecToReadTy.cloneWith(/*shape=*/{}, builder.getI1Type()); Value mask = vector::CreateMaskOp::create(builder, loc, maskType, mixedSourceDims); return mlir::vector::maskOperation(builder, transferReadOp, mask) From 1a4c19d4e590ae1ea7e6175dac8f3574e2314486 Mon Sep 17 00:00:00 2001 From: Aiden Grossman Date: Tue, 11 Nov 2025 09:50:48 -0800 Subject: [PATCH 09/64] [ProfCheck] Mark Some profverify Tests as Unsupported (#167544) These tests fail in the profcheck configuration because profinject gets added to the pipeline and adds metadata that changes the input PGO information. --- llvm/test/Transforms/PGOProfile/prof-verify-no-entrycount.ll | 4 ++++ llvm/test/Transforms/PGOProfile/prof-verify.ll | 4 ++++ llvm/test/Transforms/PGOProfile/profcheck-select.ll | 4 ++++ llvm/test/lit.cfg.py | 2 ++ llvm/utils/profcheck-xfail.txt | 3 --- 5 files changed, 14 insertions(+), 3 deletions(-) diff --git a/llvm/test/Transforms/PGOProfile/prof-verify-no-entrycount.ll b/llvm/test/Transforms/PGOProfile/prof-verify-no-entrycount.ll index 3b059fd7d8800..9c5f046af47af 100644 --- a/llvm/test/Transforms/PGOProfile/prof-verify-no-entrycount.ll +++ b/llvm/test/Transforms/PGOProfile/prof-verify-no-entrycount.ll @@ -1,3 +1,7 @@ +; This test fails under the profcheck configuration due to profcheck creating +; metadata. 
+; UNSUPPORTED: profcheck + ; Test prof-verify for functions without entry count ; RUN: not opt -passes=prof-verify %s -o - 2>&1 | FileCheck %s diff --git a/llvm/test/Transforms/PGOProfile/prof-verify.ll b/llvm/test/Transforms/PGOProfile/prof-verify.ll index 50159506e8313..75d1e6a3db571 100644 --- a/llvm/test/Transforms/PGOProfile/prof-verify.ll +++ b/llvm/test/Transforms/PGOProfile/prof-verify.ll @@ -1,3 +1,7 @@ +; This test fails under the profcheck configuration due to profcheck creating +; metadata. +; UNSUPPORTED: profcheck + ; Test prof-inject and prof-verify ; RUN: opt -passes=prof-inject %s -S -o - | FileCheck %s --check-prefix=INJECT diff --git a/llvm/test/Transforms/PGOProfile/profcheck-select.ll b/llvm/test/Transforms/PGOProfile/profcheck-select.ll index b5dc97d2d5a6d..74bcb3f52428b 100644 --- a/llvm/test/Transforms/PGOProfile/profcheck-select.ll +++ b/llvm/test/Transforms/PGOProfile/profcheck-select.ll @@ -1,3 +1,7 @@ +; This test fails under the profcheck configuration due to profcheck creating +; metadata. +; UNSUPPORTED: profcheck + ; RUN: split-file %s %t ; RUN: opt -passes=prof-inject %t/inject.ll -S -o - | FileCheck %t/inject.ll diff --git a/llvm/test/lit.cfg.py b/llvm/test/lit.cfg.py index 725ddb877f9ec..94cf8bc358514 100644 --- a/llvm/test/lit.cfg.py +++ b/llvm/test/lit.cfg.py @@ -69,6 +69,8 @@ # profiling doesn't work quite well on GPU, excluding config.excludes.append("AMDGPU") + config.available_features.add("profcheck") + # test_source_root: The root path where tests are located. config.test_source_root = os.path.dirname(__file__) diff --git a/llvm/utils/profcheck-xfail.txt b/llvm/utils/profcheck-xfail.txt index b1f20a73c3b2b..15f2c79784d1e 100644 --- a/llvm/utils/profcheck-xfail.txt +++ b/llvm/utils/profcheck-xfail.txt @@ -614,9 +614,6 @@ Transforms/PGOProfile/chr-lifetimes.ll Transforms/PGOProfile/chr-poison.ll Transforms/PGOProfile/comdat.ll Transforms/PGOProfile/memop_profile_funclet_wasm.ll -Transforms/PGOProfile/profcheck-select.ll -Transforms/PGOProfile/prof-verify.ll -Transforms/PGOProfile/prof-verify-no-entrycount.ll Transforms/PGOProfile/X86/macho.ll Transforms/PhaseOrdering/AArch64/constraint-elimination-placement.ll Transforms/PhaseOrdering/AArch64/globals-aa-required-for-vectorization.ll From a4a6b5ff63ecdea5512d47e192abd1ed81390478 Mon Sep 17 00:00:00 2001 From: Marcell Leleszi <59964679+mleleszi@users.noreply.github.com> Date: Tue, 11 Nov 2025 18:54:09 +0100 Subject: [PATCH 10/64] [libc] Refactor strftime internals to handle size_t return values (#166901) Now that https://github.com/llvm/llvm-project/pull/166517 has landed and [Writer](https://github.com/llvm/llvm-project/blob/main/libc/src/stdio/printf_core/writer.h#L130) has been refactored to track bytes written as size_t, strftime can be refactored as well to handle size_t return values. Can't think of a proper way to test this without creating a 2GB+ string, but existing tests cover most cases. --- libc/src/time/strftime.cpp | 4 ++-- libc/src/time/strftime_core/CMakeLists.txt | 1 + libc/src/time/strftime_core/strftime_main.h | 10 +++++----- libc/src/time/strftime_l.cpp | 4 ++-- 4 files changed, 10 insertions(+), 9 deletions(-) diff --git a/libc/src/time/strftime.cpp b/libc/src/time/strftime.cpp index 89b7d9bb7c1b9..ff8c05a0b07da 100644 --- a/libc/src/time/strftime.cpp +++ b/libc/src/time/strftime.cpp @@ -23,10 +23,10 @@ LLVM_LIBC_FUNCTION(size_t, strftime, printf_core::WriteMode::FILL_BUFF_AND_DROP_OVERFLOW>::value> wb(buffer, (buffsz > 0 ? 
buffsz - 1 : 0)); printf_core::Writer writer(wb); - int ret = strftime_core::strftime_main(&writer, format, timeptr); + auto ret = strftime_core::strftime_main(&writer, format, timeptr); if (buffsz > 0) // if the buffsz is 0 the buffer may be a null pointer. wb.buff[wb.buff_cur] = '\0'; - return (ret < 0 || static_cast(ret) >= buffsz) ? 0 : ret; + return (!ret.has_value() || ret.value() >= buffsz) ? 0 : ret.value(); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/time/strftime_core/CMakeLists.txt b/libc/src/time/strftime_core/CMakeLists.txt index 3ffd283ead7fe..a9aa573cc9a63 100644 --- a/libc/src/time/strftime_core/CMakeLists.txt +++ b/libc/src/time/strftime_core/CMakeLists.txt @@ -43,6 +43,7 @@ add_header_library( .core_structs .parser .converter + libc.src.__support.error_or libc.src.stdio.printf_core.writer libc.hdr.types.struct_tm ) diff --git a/libc/src/time/strftime_core/strftime_main.h b/libc/src/time/strftime_core/strftime_main.h index 2b136d83234cd..855a44107914c 100644 --- a/libc/src/time/strftime_core/strftime_main.h +++ b/libc/src/time/strftime_core/strftime_main.h @@ -10,6 +10,7 @@ #define LLVM_LIBC_SRC_STDIO_STRFTIME_CORE_STRFTIME_MAIN_H #include "hdr/types/struct_tm.h" +#include "src/__support/error_or.h" #include "src/__support/macros/config.h" #include "src/stdio/printf_core/writer.h" #include "src/time/strftime_core/converter.h" @@ -20,8 +21,8 @@ namespace LIBC_NAMESPACE_DECL { namespace strftime_core { template -int strftime_main(printf_core::Writer *writer, - const char *__restrict str, const tm *timeptr) { +ErrorOr strftime_main(printf_core::Writer *writer, + const char *__restrict str, const tm *timeptr) { Parser parser(str); int result = 0; for (strftime_core::FormatSection cur_section = parser.get_next_section(); @@ -33,11 +34,10 @@ int strftime_main(printf_core::Writer *writer, result = writer->write(cur_section.raw_string); if (result < 0) - return result; + return Error(-result); } - // TODO: Use ErrorOr - return static_cast(writer->get_chars_written()); + return writer->get_chars_written(); } } // namespace strftime_core diff --git a/libc/src/time/strftime_l.cpp b/libc/src/time/strftime_l.cpp index 409f8683b7289..2ec90634ea347 100644 --- a/libc/src/time/strftime_l.cpp +++ b/libc/src/time/strftime_l.cpp @@ -26,10 +26,10 @@ LLVM_LIBC_FUNCTION(size_t, strftime_l, printf_core::WriteMode::FILL_BUFF_AND_DROP_OVERFLOW>::value> wb(buffer, (buffsz > 0 ? buffsz - 1 : 0)); printf_core::Writer writer(wb); - int ret = strftime_core::strftime_main(&writer, format, timeptr); + auto ret = strftime_core::strftime_main(&writer, format, timeptr); if (buffsz > 0) // if the buffsz is 0 the buffer may be a null pointer. wb.buff[wb.buff_cur] = '\0'; - return (ret < 0 || static_cast(ret) >= buffsz) ? 0 : ret; + return (!ret.has_value() || ret.value() >= buffsz) ? 
0 : ret.value(); } } // namespace LIBC_NAMESPACE_DECL From 497dc100c9c4487141af342aac26da789a23e0a3 Mon Sep 17 00:00:00 2001 From: Anshul Nigham Date: Tue, 11 Nov 2025 09:54:21 -0800 Subject: [PATCH 11/64] [libc] Implement fchown (#167286) Implements fchown fixes: #166856 --- libc/config/linux/aarch64/entrypoints.txt | 1 + libc/config/linux/x86_64/entrypoints.txt | 1 + libc/include/unistd.yaml | 8 ++++ libc/src/unistd/CMakeLists.txt | 7 ++++ libc/src/unistd/fchown.h | 22 ++++++++++ libc/src/unistd/linux/CMakeLists.txt | 14 +++++++ libc/src/unistd/linux/fchown.cpp | 31 ++++++++++++++ libc/test/src/unistd/CMakeLists.txt | 20 +++++++++ libc/test/src/unistd/fchown_test.cpp | 50 +++++++++++++++++++++++ 9 files changed, 154 insertions(+) create mode 100644 libc/src/unistd/fchown.h create mode 100644 libc/src/unistd/linux/fchown.cpp create mode 100644 libc/test/src/unistd/fchown_test.cpp diff --git a/libc/config/linux/aarch64/entrypoints.txt b/libc/config/linux/aarch64/entrypoints.txt index e0dd15b803253..144237aee7f93 100644 --- a/libc/config/linux/aarch64/entrypoints.txt +++ b/libc/config/linux/aarch64/entrypoints.txt @@ -327,6 +327,7 @@ set(TARGET_LIBC_ENTRYPOINTS libc.src.unistd.execve libc.src.unistd.faccessat libc.src.unistd.fchdir + libc.src.unistd.fchown libc.src.unistd.fpathconf libc.src.unistd.fsync libc.src.unistd.ftruncate diff --git a/libc/config/linux/x86_64/entrypoints.txt b/libc/config/linux/x86_64/entrypoints.txt index a44e2041e57f2..f4012514fe20e 100644 --- a/libc/config/linux/x86_64/entrypoints.txt +++ b/libc/config/linux/x86_64/entrypoints.txt @@ -334,6 +334,7 @@ set(TARGET_LIBC_ENTRYPOINTS libc.src.unistd.execve libc.src.unistd.faccessat libc.src.unistd.fchdir + libc.src.unistd.fchown libc.src.unistd.fpathconf libc.src.unistd.fsync libc.src.unistd.ftruncate diff --git a/libc/include/unistd.yaml b/libc/include/unistd.yaml index 0e5b22e627b67..3f5e957768533 100644 --- a/libc/include/unistd.yaml +++ b/libc/include/unistd.yaml @@ -120,6 +120,14 @@ functions: return_type: int arguments: - type: int + - name: fchown + standards: + - POSIX + return_type: int + arguments: + - type: int + - type: uid_t + - type: gid_t - name: fork standards: - POSIX diff --git a/libc/src/unistd/CMakeLists.txt b/libc/src/unistd/CMakeLists.txt index 337480cbbf928..b7444a4722b0d 100644 --- a/libc/src/unistd/CMakeLists.txt +++ b/libc/src/unistd/CMakeLists.txt @@ -76,6 +76,13 @@ add_entrypoint_object( .${LIBC_TARGET_OS}.fchdir ) +add_entrypoint_object( + fchown + ALIAS + DEPENDS + .${LIBC_TARGET_OS}.fchown +) + add_entrypoint_object( fork ALIAS diff --git a/libc/src/unistd/fchown.h b/libc/src/unistd/fchown.h new file mode 100644 index 0000000000000..9ea44426568cc --- /dev/null +++ b/libc/src/unistd/fchown.h @@ -0,0 +1,22 @@ +//===-- Implementation header for fchown ------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_UNISTD_FCHOWN_H
+#define LLVM_LIBC_SRC_UNISTD_FCHOWN_H
+
+#include "hdr/types/gid_t.h"
+#include "hdr/types/uid_t.h"
+#include "src/__support/macros/config.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+int fchown(int fildes, uid_t owner, gid_t group);
+
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC_UNISTD_FCHOWN_H
diff --git a/libc/src/unistd/linux/CMakeLists.txt b/libc/src/unistd/linux/CMakeLists.txt
index c2dacc6456e27..c45b6ef1c5d80 100644
--- a/libc/src/unistd/linux/CMakeLists.txt
+++ b/libc/src/unistd/linux/CMakeLists.txt
@@ -120,6 +120,20 @@ add_entrypoint_object(
   libc.src.errno.errno
 )
 
+add_entrypoint_object(
+  fchown
+  SRCS
+    fchown.cpp
+  HDRS
+    ../fchown.h
+  DEPENDS
+    libc.hdr.types.uid_t
+    libc.hdr.types.gid_t
+    libc.include.sys_syscall
+    libc.src.__support.OSUtil.osutil
+    libc.src.errno.errno
+)
+
 add_entrypoint_object(
   fork
   SRCS
diff --git a/libc/src/unistd/linux/fchown.cpp b/libc/src/unistd/linux/fchown.cpp
new file mode 100644
index 0000000000000..9cf3d139050c1
--- /dev/null
+++ b/libc/src/unistd/linux/fchown.cpp
@@ -0,0 +1,31 @@
+//===-- Linux implementation of fchown ------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/unistd/fchown.h"
+
+#include "src/__support/OSUtil/syscall.h" // For internal syscall function.
+#include "src/__support/common.h"
+
+#include "hdr/types/gid_t.h"
+#include "hdr/types/uid_t.h"
+#include "src/__support/libc_errno.h"
+#include "src/__support/macros/config.h"
+#include <sys/syscall.h> // For syscall numbers.
+
+namespace LIBC_NAMESPACE_DECL {
+
+LLVM_LIBC_FUNCTION(int, fchown, (int fildes, uid_t owner, gid_t group)) {
+  int ret = LIBC_NAMESPACE::syscall_impl<int>(SYS_fchown, fildes, owner, group);
+  if (ret < 0) {
+    libc_errno = -ret;
+    return -1;
+  }
+  return 0;
+}
+
+} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/test/src/unistd/CMakeLists.txt b/libc/test/src/unistd/CMakeLists.txt
index 07070535459ec..3012ea9a466f4 100644
--- a/libc/test/src/unistd/CMakeLists.txt
+++ b/libc/test/src/unistd/CMakeLists.txt
@@ -146,6 +146,26 @@ add_libc_unittest(
     libc.test.UnitTest.ErrnoSetterMatcher
 )
 
+add_libc_unittest(
+  fchown_test
+  SUITE
+    libc_unistd_unittests
+  SRCS
+    fchown_test.cpp
+  DEPENDS
+    libc.hdr.fcntl_macros
+    libc.include.unistd
+    libc.src.errno.errno
+    libc.src.unistd.fchown
+    libc.src.unistd.close
+    libc.src.unistd.unlink
+    libc.src.fcntl.open
+    libc.src.unistd.getuid
+    libc.src.unistd.getgid
+    libc.test.UnitTest.ErrnoCheckingTest
+    libc.test.UnitTest.ErrnoSetterMatcher
+)
+
 add_libc_unittest(
   ftruncate_test
   SUITE
diff --git a/libc/test/src/unistd/fchown_test.cpp b/libc/test/src/unistd/fchown_test.cpp
new file mode 100644
index 0000000000000..7954410afb929
--- /dev/null
+++ b/libc/test/src/unistd/fchown_test.cpp
@@ -0,0 +1,50 @@
+//===-- Unittests for fchown ----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/fcntl/open.h"
+#include "src/unistd/close.h"
+#include "src/unistd/fchown.h"
+#include "src/unistd/getgid.h"
+#include "src/unistd/getuid.h"
+#include "src/unistd/unlink.h"
+
+#include "test/UnitTest/ErrnoCheckingTest.h"
+#include "test/UnitTest/ErrnoSetterMatcher.h"
+#include "test/UnitTest/Test.h"
+
+#include "hdr/fcntl_macros.h"
+#include <sys/stat.h>
+
+using LlvmLibcFchownTest = LIBC_NAMESPACE::testing::ErrnoCheckingTest;
+
+TEST_F(LlvmLibcFchownTest, FchownSuccess) {
+  using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds;
+  uid_t my_uid = LIBC_NAMESPACE::getuid();
+  gid_t my_gid = LIBC_NAMESPACE::getgid();
+  constexpr const char *FILENAME = "fchown.test";
+  auto TEST_FILE = libc_make_test_file_path(FILENAME);
+
+  // Create a test file.
+  int write_fd = LIBC_NAMESPACE::open(TEST_FILE, O_WRONLY | O_CREAT, S_IRWXU);
+  ASSERT_ERRNO_SUCCESS();
+  ASSERT_GT(write_fd, 0);
+
+  // Change the ownership of the file.
+  ASSERT_THAT(LIBC_NAMESPACE::fchown(write_fd, my_uid, my_gid), Succeeds(0));
+
+  // Close the file descriptor.
+  ASSERT_THAT(LIBC_NAMESPACE::close(write_fd), Succeeds(0));
+
+  // Clean up the test file.
+  ASSERT_THAT(LIBC_NAMESPACE::unlink(TEST_FILE), Succeeds(0));
+}
+
+TEST_F(LlvmLibcFchownTest, FchownInvalidFileDescriptor) {
+  using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Fails;
+  ASSERT_THAT(LIBC_NAMESPACE::fchown(-1, 1000, 1000), Fails(EBADF));
+}

From 79601cec3366de4b15787d0f4e1c1a6000538180 Mon Sep 17 00:00:00 2001
From: Anton Shepelev <44649959+amemov@users.noreply.github.com>
Date: Tue, 11 Nov 2025 09:54:34 -0800
Subject: [PATCH 12/64] [libc][POSIX] Add clock_settime() function for Linux
 (#161729)

Closes #161461 - This is my first time contributing to libc's POSIX, so
for reference I used the `clock_gettime` implementation for Linux.
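
As a rough usage sketch (illustrative only; the values are hypothetical,
and the permission/validation behavior in the comments is what the new
unit test checks):

    #include <time.h>

    struct timespec ts = {/*tv_sec=*/0, /*tv_nsec=*/0};
    // Setting CLOCK_REALTIME requires CAP_SYS_TIME (EPERM otherwise);
    // non-settable clocks such as CLOCK_MONOTONIC fail with EINVAL.
    int rc = clock_settime(CLOCK_REALTIME, &ts);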
For convenience, here is the description of `clock_settime` function [behavior](https://www.man7.org/linux/man-pages/man3/clock_settime.3.html) --- libc/config/linux/aarch64/entrypoints.txt | 1 + libc/config/linux/riscv/entrypoints.txt | 1 + libc/config/linux/x86_64/entrypoints.txt | 1 + libc/docs/headers/time.rst | 4 +- libc/include/time.yaml | 7 +++ libc/src/__support/time/CMakeLists.txt | 9 ++++ libc/src/__support/time/clock_settime.h | 22 ++++++++ libc/src/__support/time/linux/CMakeLists.txt | 15 ++++++ .../__support/time/linux/clock_settime.cpp | 53 ++++++++++++++++++ libc/src/time/CMakeLists.txt | 8 +++ libc/src/time/clock_settime.h | 22 ++++++++ libc/src/time/linux/CMakeLists.txt | 13 +++++ libc/src/time/linux/clock.cpp | 2 +- libc/src/time/linux/clock_gettime.cpp | 3 +- libc/src/time/linux/clock_settime.cpp | 30 +++++++++++ libc/src/time/linux/nanosleep.cpp | 3 +- libc/src/time/linux/timespec_get.cpp | 2 +- libc/test/src/time/CMakeLists.txt | 15 ++++++ libc/test/src/time/clock_settime_test.cpp | 54 +++++++++++++++++++ 19 files changed, 257 insertions(+), 8 deletions(-) create mode 100644 libc/src/__support/time/clock_settime.h create mode 100644 libc/src/__support/time/linux/clock_settime.cpp create mode 100644 libc/src/time/clock_settime.h create mode 100644 libc/src/time/linux/clock_settime.cpp create mode 100644 libc/test/src/time/clock_settime_test.cpp diff --git a/libc/config/linux/aarch64/entrypoints.txt b/libc/config/linux/aarch64/entrypoints.txt index 144237aee7f93..42571862b24b2 100644 --- a/libc/config/linux/aarch64/entrypoints.txt +++ b/libc/config/linux/aarch64/entrypoints.txt @@ -1144,6 +1144,7 @@ if(LLVM_LIBC_FULL_BUILD) libc.src.time.ctime_r libc.src.time.clock libc.src.time.clock_gettime + libc.src.time.clock_settime libc.src.time.difftime libc.src.time.gettimeofday libc.src.time.gmtime diff --git a/libc/config/linux/riscv/entrypoints.txt b/libc/config/linux/riscv/entrypoints.txt index 0d031d8844f13..b62a46b7178d5 100644 --- a/libc/config/linux/riscv/entrypoints.txt +++ b/libc/config/linux/riscv/entrypoints.txt @@ -1273,6 +1273,7 @@ if(LLVM_LIBC_FULL_BUILD) libc.src.time.ctime_r libc.src.time.clock libc.src.time.clock_gettime + libc.src.time.clock_settime libc.src.time.difftime libc.src.time.gettimeofday libc.src.time.gmtime diff --git a/libc/config/linux/x86_64/entrypoints.txt b/libc/config/linux/x86_64/entrypoints.txt index f4012514fe20e..8a46a7a1baae3 100644 --- a/libc/config/linux/x86_64/entrypoints.txt +++ b/libc/config/linux/x86_64/entrypoints.txt @@ -1314,6 +1314,7 @@ if(LLVM_LIBC_FULL_BUILD) libc.src.time.ctime_r libc.src.time.clock libc.src.time.clock_gettime + libc.src.time.clock_settime libc.src.time.difftime libc.src.time.gettimeofday libc.src.time.gmtime diff --git a/libc/docs/headers/time.rst b/libc/docs/headers/time.rst index 55bc1a17ee285..f07e0d93a4ce6 100644 --- a/libc/docs/headers/time.rst +++ b/libc/docs/headers/time.rst @@ -67,11 +67,11 @@ Implementation Status +---------------------+---------+---------+---------+-----------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ | clock_getres | | | | | | | | | | | | | | +---------------------+---------+---------+---------+-----------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| clock_gettime | |check| | |check| | | |check| | | | | | | | | | | +| clock_gettime | |check| | |check| | | |check| | | | | | | | | |check| | |check| | 
+---------------------+---------+---------+---------+-----------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ | clock_nanosleep | | | | | | | | | | | | | | +---------------------+---------+---------+---------+-----------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| clock_settime | | | | | | | | | | | | | | +| clock_settime | |check| | |check| | | |check| | | | | | | | | | | +---------------------+---------+---------+---------+-----------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ | ctime | |check| | |check| | | |check| | | | | | | | | | | +---------------------+---------+---------+---------+-----------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ diff --git a/libc/include/time.yaml b/libc/include/time.yaml index 88e50d1288238..c2b8a1e4cfb8e 100644 --- a/libc/include/time.yaml +++ b/libc/include/time.yaml @@ -67,6 +67,13 @@ functions: arguments: - type: clockid_t - type: struct timespec * + - name: clock_settime + standard: + - POSIX + return_type: int + arguments: + - type: clockid_t + - type: const struct timespec * - name: difftime standard: - stdc diff --git a/libc/src/__support/time/CMakeLists.txt b/libc/src/__support/time/CMakeLists.txt index 8247e792e8410..3851037e4161f 100644 --- a/libc/src/__support/time/CMakeLists.txt +++ b/libc/src/__support/time/CMakeLists.txt @@ -19,3 +19,12 @@ add_object_library( DEPENDS libc.src.__support.time.${LIBC_TARGET_OS}.clock_gettime ) + +if(TARGET libc.src.__support.time.${LIBC_TARGET_OS}.clock_settime) + add_object_library( + clock_settime + ALIAS + DEPENDS + libc.src.__support.time.${LIBC_TARGET_OS}.clock_settime + ) +endif() diff --git a/libc/src/__support/time/clock_settime.h b/libc/src/__support/time/clock_settime.h new file mode 100644 index 0000000000000..d8d305cadf4b9 --- /dev/null +++ b/libc/src/__support/time/clock_settime.h @@ -0,0 +1,22 @@ +//===--- clock_settime linux implementation ---------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC___SUPPORT_TIME_CLOCK_SETTIME_H
+#define LLVM_LIBC_SRC___SUPPORT_TIME_CLOCK_SETTIME_H
+
+#include "hdr/types/clockid_t.h"
+#include "hdr/types/struct_timespec.h"
+#include "src/__support/error_or.h"
+
+namespace LIBC_NAMESPACE_DECL {
+namespace internal {
+ErrorOr<int> clock_settime(clockid_t clockid, const timespec *ts);
+} // namespace internal
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC___SUPPORT_TIME_CLOCK_SETTIME_H
diff --git a/libc/src/__support/time/linux/CMakeLists.txt b/libc/src/__support/time/linux/CMakeLists.txt
index 6fec7eeba99ad..478529502b403 100644
--- a/libc/src/__support/time/linux/CMakeLists.txt
+++ b/libc/src/__support/time/linux/CMakeLists.txt
@@ -14,6 +14,21 @@ add_object_library(
     libc.src.__support.OSUtil.linux.vdso
 )
 
+add_object_library(
+  clock_settime
+  HDRS
+    ../clock_settime.h
+  SRCS
+    clock_settime.cpp
+  DEPENDS
+    libc.include.sys_syscall
+    libc.hdr.types.struct_timespec
+    libc.hdr.types.clockid_t
+    libc.src.__support.common
+    libc.src.__support.error_or
+    libc.src.__support.OSUtil.osutil
+)
+
 add_header_library(
   clock_conversion
   HDRS
diff --git a/libc/src/__support/time/linux/clock_settime.cpp b/libc/src/__support/time/linux/clock_settime.cpp
new file mode 100644
index 0000000000000..dd42610adb031
--- /dev/null
+++ b/libc/src/__support/time/linux/clock_settime.cpp
@@ -0,0 +1,53 @@
+//===--- clock_settime linux implementation ---------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/__support/time/clock_settime.h"
+#include "hdr/types/clockid_t.h"
+#include "hdr/types/struct_timespec.h"
+#include "src/__support/OSUtil/syscall.h"
+#include "src/__support/common.h"
+#include "src/__support/error_or.h"
+#include "src/__support/macros/config.h"
+#include <sys/syscall.h>
+
+#if defined(SYS_clock_settime64)
+#include <linux/time_types.h>
+#endif
+
+namespace LIBC_NAMESPACE_DECL {
+namespace internal {
+ErrorOr<int> clock_settime(clockid_t clockid, const timespec *ts) {
+  int ret;
+#if defined(SYS_clock_settime)
+  ret = LIBC_NAMESPACE::syscall_impl<int>(SYS_clock_settime,
+                                          static_cast<long>(clockid),
+                                          reinterpret_cast<long>(ts));
+#elif defined(SYS_clock_settime64)
+  static_assert(
+      sizeof(time_t) == sizeof(int64_t),
+      "SYS_clock_settime64 requires struct timespec with 64-bit members.");
+
+  __kernel_timespec ts64{};
+
+  // Populate the 64-bit kernel structure from the user-provided timespec
+  ts64.tv_sec = static_cast<int64_t>(ts->tv_sec);
+  ts64.tv_nsec = static_cast<int64_t>(ts->tv_nsec);
+
+  ret = LIBC_NAMESPACE::syscall_impl<int>(SYS_clock_settime64,
+                                          static_cast<long>(clockid),
+                                          reinterpret_cast<long>(&ts64));
+#else
+#error "SYS_clock_settime and SYS_clock_settime64 syscalls not available."
+#endif + if (ret < 0) + return Error(-ret); + return ret; +} + +} // namespace internal +} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/time/CMakeLists.txt b/libc/src/time/CMakeLists.txt index ec942e38d1af5..4d647c22c3239 100644 --- a/libc/src/time/CMakeLists.txt +++ b/libc/src/time/CMakeLists.txt @@ -245,3 +245,11 @@ add_entrypoint_object( DEPENDS .${LIBC_TARGET_OS}.clock_getres ) + +add_entrypoint_object( + clock_settime + ALIAS + DEPENDS + .${LIBC_TARGET_OS}.clock_settime +) + diff --git a/libc/src/time/clock_settime.h b/libc/src/time/clock_settime.h new file mode 100644 index 0000000000000..9321dd1074101 --- /dev/null +++ b/libc/src/time/clock_settime.h @@ -0,0 +1,22 @@ +//===-- Implementation header for clock_settime function --------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_TIME_CLOCK_SETTIME_H +#define LLVM_LIBC_SRC_TIME_CLOCK_SETTIME_H + +#include "hdr/types/clockid_t.h" +#include "hdr/types/struct_timespec.h" +#include "src/__support/macros/config.h" + +namespace LIBC_NAMESPACE_DECL { + +int clock_settime(clockid_t clockid, const timespec *tp); + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC_TIME_CLOCK_SETTIME_H diff --git a/libc/src/time/linux/CMakeLists.txt b/libc/src/time/linux/CMakeLists.txt index a6ec7c7c06963..6ea04597063cb 100644 --- a/libc/src/time/linux/CMakeLists.txt +++ b/libc/src/time/linux/CMakeLists.txt @@ -54,6 +54,19 @@ add_entrypoint_object( libc.src.errno.errno ) +add_entrypoint_object( + clock_settime + SRCS + clock_settime.cpp + HDRS + ../clock_settime.h + DEPENDS + libc.hdr.types.clockid_t + libc.hdr.types.struct_timespec + libc.src.__support.time.clock_settime + libc.src.errno.errno +) + add_entrypoint_object( gettimeofday SRCS diff --git a/libc/src/time/linux/clock.cpp b/libc/src/time/linux/clock.cpp index c38697cd0668e..c560bd10be83c 100644 --- a/libc/src/time/linux/clock.cpp +++ b/libc/src/time/linux/clock.cpp @@ -19,7 +19,7 @@ namespace LIBC_NAMESPACE_DECL { LLVM_LIBC_FUNCTION(clock_t, clock, ()) { using namespace time_units; - struct timespec ts; + timespec ts; auto result = internal::clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &ts); if (!result.has_value()) { libc_errno = result.error(); diff --git a/libc/src/time/linux/clock_gettime.cpp b/libc/src/time/linux/clock_gettime.cpp index b3fcd2b22f9da..52ace2a743dd4 100644 --- a/libc/src/time/linux/clock_gettime.cpp +++ b/libc/src/time/linux/clock_gettime.cpp @@ -15,8 +15,7 @@ namespace LIBC_NAMESPACE_DECL { // TODO(michaelrj): Move this into time/linux with the other syscalls. -LLVM_LIBC_FUNCTION(int, clock_gettime, - (clockid_t clockid, struct timespec *ts)) { +LLVM_LIBC_FUNCTION(int, clock_gettime, (clockid_t clockid, timespec *ts)) { auto result = internal::clock_gettime(clockid, ts); // A negative return value indicates an error with the magnitude of the diff --git a/libc/src/time/linux/clock_settime.cpp b/libc/src/time/linux/clock_settime.cpp new file mode 100644 index 0000000000000..3c582cf0b4646 --- /dev/null +++ b/libc/src/time/linux/clock_settime.cpp @@ -0,0 +1,30 @@ +//===---------- Linux implementation of the POSIX clock_settime function --===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/time/clock_settime.h" +#include "src/__support/common.h" +#include "src/__support/libc_errno.h" +#include "src/__support/macros/config.h" +#include "src/__support/time/clock_settime.h" + +namespace LIBC_NAMESPACE_DECL { + +LLVM_LIBC_FUNCTION(int, clock_settime, + (clockid_t clockid, const timespec *ts)) { + auto result = internal::clock_settime(clockid, ts); + + // A negative return value indicates an error with the magnitude of the + // value being the error code. + if (!result.has_value()) { + libc_errno = result.error(); + return -1; + } + return 0; +} + +} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/time/linux/nanosleep.cpp b/libc/src/time/linux/nanosleep.cpp index e5df1585df988..a30b97de40492 100644 --- a/libc/src/time/linux/nanosleep.cpp +++ b/libc/src/time/linux/nanosleep.cpp @@ -18,8 +18,7 @@ namespace LIBC_NAMESPACE_DECL { -LLVM_LIBC_FUNCTION(int, nanosleep, - (const struct timespec *req, struct timespec *rem)) { +LLVM_LIBC_FUNCTION(int, nanosleep, (const timespec *req, timespec *rem)) { #if SYS_nanosleep int ret = LIBC_NAMESPACE::syscall_impl(SYS_nanosleep, req, rem); #elif defined(SYS_clock_nanosleep_time64) diff --git a/libc/src/time/linux/timespec_get.cpp b/libc/src/time/linux/timespec_get.cpp index a4d4372332732..031cb9f83b1c3 100644 --- a/libc/src/time/linux/timespec_get.cpp +++ b/libc/src/time/linux/timespec_get.cpp @@ -15,7 +15,7 @@ namespace LIBC_NAMESPACE_DECL { -LLVM_LIBC_FUNCTION(int, timespec_get, (struct timespec * ts, int base)) { +LLVM_LIBC_FUNCTION(int, timespec_get, (timespec * ts, int base)) { clockid_t clockid; switch (base) { case TIME_UTC: diff --git a/libc/test/src/time/CMakeLists.txt b/libc/test/src/time/CMakeLists.txt index 03e5428292418..c8e113f06d50b 100644 --- a/libc/test/src/time/CMakeLists.txt +++ b/libc/test/src/time/CMakeLists.txt @@ -124,6 +124,21 @@ add_libc_test( libc.src.time.clock_getres ) +add_libc_test( + clock_settime_test + SUITE + libc_time_unittests + SRCS + clock_settime_test.cpp + DEPENDS + libc.src.time.clock_settime + libc.hdr.types.time_t + libc.hdr.types.struct_timespec + libc.hdr.time_macros + libc.hdr.errno_macros + libc.test.UnitTest.ErrnoCheckingTest +) + add_libc_unittest( difftime_test SUITE diff --git a/libc/test/src/time/clock_settime_test.cpp b/libc/test/src/time/clock_settime_test.cpp new file mode 100644 index 0000000000000..ccbad9ed2e847 --- /dev/null +++ b/libc/test/src/time/clock_settime_test.cpp @@ -0,0 +1,54 @@ +//===-- Unittests for clock_settime ---------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "hdr/time_macros.h" +#include "hdr/types/struct_timespec.h" +#include "src/time/clock_settime.h" +#include "test/UnitTest/ErrnoCheckingTest.h" +#include "test/UnitTest/Test.h" + +using LlvmLibcClockSetTime = LIBC_NAMESPACE::testing::ErrnoCheckingTest; + +#ifdef CLOCK_MONOTONIC +TEST_F(LlvmLibcClockSetTime, MonotonicIsNotSettable) { + timespec ts = {0, 0}; + int result = LIBC_NAMESPACE::clock_settime(CLOCK_MONOTONIC, &ts); + ASSERT_EQ(result, -1); + ASSERT_ERRNO_EQ(EINVAL); +} +#endif // CLOCK_MONOTONIC + +TEST_F(LlvmLibcClockSetTime, InvalidClockId) { + timespec ts = {0, 0}; + int result = LIBC_NAMESPACE::clock_settime(static_cast(-1), &ts); + ASSERT_EQ(result, -1); + ASSERT_ERRNO_EQ(EINVAL); +} + +TEST_F(LlvmLibcClockSetTime, InvalidTimespecNsec) { + timespec ts = {0, 1000000000L}; + int result = LIBC_NAMESPACE::clock_settime(CLOCK_REALTIME, &ts); + ASSERT_EQ(result, -1); + ASSERT_ERRNO_EQ(EINVAL); +} + +TEST_F(LlvmLibcClockSetTime, NullPointerIsEFAULT) { + int result = LIBC_NAMESPACE::clock_settime(CLOCK_REALTIME, nullptr); + ASSERT_EQ(result, -1); + ASSERT_ERRNO_EQ(EFAULT); +} + +TEST_F(LlvmLibcClockSetTime, ClockIsSet) { + timespec ts = {0, 0}; + int result = LIBC_NAMESPACE::clock_settime(CLOCK_REALTIME, &ts); + if (result == 0) { + ASSERT_ERRNO_SUCCESS(); + } else { + ASSERT_ERRNO_EQ(EPERM); + } +} From b7e35ccd3abf27e8590b675dbcec8ffc013dbc52 Mon Sep 17 00:00:00 2001 From: Nico Weber Date: Tue, 11 Nov 2025 12:51:13 -0500 Subject: [PATCH 13/64] [gn] port f63d33da0a51 (clangOptions) --- .../clang-tools-extra/clangd/BUILD.gn | 1 + .../clang-tools-extra/modularize/BUILD.gn | 1 + .../clang-tools-extra/pp-trace/BUILD.gn | 1 + .../clang/{Driver => Options}/BUILD.gn | 0 .../gn/secondary/clang/lib/Driver/BUILD.gn | 6 ------ .../gn/secondary/clang/lib/Frontend/BUILD.gn | 1 + .../secondary/clang/lib/FrontendTool/BUILD.gn | 1 + .../gn/secondary/clang/lib/Options/BUILD.gn | 19 +++++++++++++++++++ .../gn/secondary/clang/lib/Tooling/BUILD.gn | 3 ++- .../clang/tools/clang-check/BUILD.gn | 1 + .../clang/tools/clang-installapi/BUILD.gn | 1 + .../gn/secondary/clang/tools/driver/BUILD.gn | 1 + 12 files changed, 29 insertions(+), 7 deletions(-) rename llvm/utils/gn/secondary/clang/include/clang/{Driver => Options}/BUILD.gn (100%) create mode 100644 llvm/utils/gn/secondary/clang/lib/Options/BUILD.gn diff --git a/llvm/utils/gn/secondary/clang-tools-extra/clangd/BUILD.gn b/llvm/utils/gn/secondary/clang-tools-extra/clangd/BUILD.gn index f8c4838ab7ee3..bd225dd1e5656 100644 --- a/llvm/utils/gn/secondary/clang-tools-extra/clangd/BUILD.gn +++ b/llvm/utils/gn/secondary/clang-tools-extra/clangd/BUILD.gn @@ -52,6 +52,7 @@ static_library("clangd") { "//clang/lib/Frontend", "//clang/lib/Index", "//clang/lib/Lex", + "//clang/lib/Options", "//clang/lib/Sema", "//clang/lib/Serialization", "//clang/lib/Tooling", diff --git a/llvm/utils/gn/secondary/clang-tools-extra/modularize/BUILD.gn b/llvm/utils/gn/secondary/clang-tools-extra/modularize/BUILD.gn index 1d01bd8e87d37..8f1c1d8bb0ed7 100644 --- a/llvm/utils/gn/secondary/clang-tools-extra/modularize/BUILD.gn +++ b/llvm/utils/gn/secondary/clang-tools-extra/modularize/BUILD.gn @@ -6,6 +6,7 @@ executable("modularize") { "//clang/lib/Driver", "//clang/lib/Frontend", "//clang/lib/Lex", + "//clang/lib/Options", "//clang/lib/Serialization", "//clang/lib/Tooling", "//llvm/lib/Option", diff --git 
a/llvm/utils/gn/secondary/clang-tools-extra/pp-trace/BUILD.gn b/llvm/utils/gn/secondary/clang-tools-extra/pp-trace/BUILD.gn index 2c0eac074d355..5acee9dd52518 100644 --- a/llvm/utils/gn/secondary/clang-tools-extra/pp-trace/BUILD.gn +++ b/llvm/utils/gn/secondary/clang-tools-extra/pp-trace/BUILD.gn @@ -5,6 +5,7 @@ executable("pp-trace") { "//clang/lib/Basic", "//clang/lib/Frontend", "//clang/lib/Lex", + "//clang/lib/Options", "//clang/lib/Serialization", "//clang/lib/Tooling", "//llvm/lib/Support", diff --git a/llvm/utils/gn/secondary/clang/include/clang/Driver/BUILD.gn b/llvm/utils/gn/secondary/clang/include/clang/Options/BUILD.gn similarity index 100% rename from llvm/utils/gn/secondary/clang/include/clang/Driver/BUILD.gn rename to llvm/utils/gn/secondary/clang/include/clang/Options/BUILD.gn diff --git a/llvm/utils/gn/secondary/clang/lib/Driver/BUILD.gn b/llvm/utils/gn/secondary/clang/lib/Driver/BUILD.gn index e1f31179b9a20..8cc4c78bf569a 100644 --- a/llvm/utils/gn/secondary/clang/lib/Driver/BUILD.gn +++ b/llvm/utils/gn/secondary/clang/lib/Driver/BUILD.gn @@ -21,10 +21,6 @@ static_library("Driver") { "//llvm/lib/TargetParser", "//llvm/lib/WindowsDriver", ] - public_deps = [ - # public_dep because public header Options.h includes generated Options.inc. - "//clang/include/clang/Driver:Options", - ] if (host_os == "win") { # MSVCToolChain.cpp uses version.dll. libs = [ "version.lib" ] @@ -34,12 +30,10 @@ static_library("Driver") { "Compilation.cpp", "Distro.cpp", "Driver.cpp", - "DriverOptions.cpp", "Job.cpp", "Multilib.cpp", "MultilibBuilder.cpp", "OffloadBundler.cpp", - "OptionUtils.cpp", "Phases.cpp", "SanitizerArgs.cpp", "Tool.cpp", diff --git a/llvm/utils/gn/secondary/clang/lib/Frontend/BUILD.gn b/llvm/utils/gn/secondary/clang/lib/Frontend/BUILD.gn index 948d1405676b7..4009cfc609f4a 100644 --- a/llvm/utils/gn/secondary/clang/lib/Frontend/BUILD.gn +++ b/llvm/utils/gn/secondary/clang/lib/Frontend/BUILD.gn @@ -9,6 +9,7 @@ static_library("Frontend") { "//clang/lib/Driver", "//clang/lib/Edit", "//clang/lib/Lex", + "//clang/lib/Options", "//clang/lib/Parse", "//clang/lib/Sema", "//clang/lib/Serialization", diff --git a/llvm/utils/gn/secondary/clang/lib/FrontendTool/BUILD.gn b/llvm/utils/gn/secondary/clang/lib/FrontendTool/BUILD.gn index 9f7140ed391dd..707eabf1af70b 100644 --- a/llvm/utils/gn/secondary/clang/lib/FrontendTool/BUILD.gn +++ b/llvm/utils/gn/secondary/clang/lib/FrontendTool/BUILD.gn @@ -11,6 +11,7 @@ static_library("FrontendTool") { "//clang/lib/ExtractAPI", "//clang/lib/Frontend", "//clang/lib/Frontend/Rewrite", + "//clang/lib/Options", "//llvm/lib/Option", "//llvm/lib/Support", ] diff --git a/llvm/utils/gn/secondary/clang/lib/Options/BUILD.gn b/llvm/utils/gn/secondary/clang/lib/Options/BUILD.gn new file mode 100644 index 0000000000000..3f022ed7f7480 --- /dev/null +++ b/llvm/utils/gn/secondary/clang/lib/Options/BUILD.gn @@ -0,0 +1,19 @@ +static_library("Options") { + output_name = "clangOptions" + configs += [ "//llvm/utils/gn/build:clang_code" ] + include_dirs = [ "." ] + deps = [ + "//clang/include/clang/Config", + "//clang/lib/Basic", + "//llvm/lib/Option", + "//llvm/lib/Support", + ] + public_deps = [ + # public_dep because public header Options.h includes generated Options.inc. 
+ "//clang/include/clang/Options:Options", + ] + sources = [ + "DriverOptions.cpp", + "OptionUtils.cpp", + ] +} diff --git a/llvm/utils/gn/secondary/clang/lib/Tooling/BUILD.gn b/llvm/utils/gn/secondary/clang/lib/Tooling/BUILD.gn index 0087aad982672..5ec5500010b39 100644 --- a/llvm/utils/gn/secondary/clang/lib/Tooling/BUILD.gn +++ b/llvm/utils/gn/secondary/clang/lib/Tooling/BUILD.gn @@ -2,7 +2,7 @@ static_library("Tooling") { output_name = "clangTooling" configs += [ "//llvm/utils/gn/build:clang_code" ] deps = [ - "//clang/include/clang/Driver:Options", + "//clang/include/clang/Options:Options", "//clang/lib/AST", "//clang/lib/ASTMatchers", "//clang/lib/Basic", @@ -10,6 +10,7 @@ static_library("Tooling") { "//clang/lib/Format", "//clang/lib/Frontend", "//clang/lib/Lex", + "//clang/lib/Options", "//clang/lib/Rewrite", "//clang/lib/Tooling/Core", "//llvm/lib/TargetParser", diff --git a/llvm/utils/gn/secondary/clang/tools/clang-check/BUILD.gn b/llvm/utils/gn/secondary/clang/tools/clang-check/BUILD.gn index 39a92f6ac921f..dd796d42038b1 100644 --- a/llvm/utils/gn/secondary/clang/tools/clang-check/BUILD.gn +++ b/llvm/utils/gn/secondary/clang/tools/clang-check/BUILD.gn @@ -6,6 +6,7 @@ executable("clang-check") { "//clang/lib/Driver", "//clang/lib/Frontend", "//clang/lib/Frontend/Rewrite", + "//clang/lib/Options", "//clang/lib/StaticAnalyzer/Frontend", "//clang/lib/Tooling", "//clang/lib/Tooling/Syntax", diff --git a/llvm/utils/gn/secondary/clang/tools/clang-installapi/BUILD.gn b/llvm/utils/gn/secondary/clang/tools/clang-installapi/BUILD.gn index 89f0107686f91..774402a9931a9 100644 --- a/llvm/utils/gn/secondary/clang/tools/clang-installapi/BUILD.gn +++ b/llvm/utils/gn/secondary/clang/tools/clang-installapi/BUILD.gn @@ -13,6 +13,7 @@ driver_executable("clang-installapi") { "//clang/lib/Driver", "//clang/lib/Frontend", "//clang/lib/InstallAPI", + "//clang/lib/Options", "//clang/lib/Tooling", "//llvm/lib/Support", "//llvm/lib/TargetParser", diff --git a/llvm/utils/gn/secondary/clang/tools/driver/BUILD.gn b/llvm/utils/gn/secondary/clang/tools/driver/BUILD.gn index 54fca3bf1f50b..b402bbba59ca0 100644 --- a/llvm/utils/gn/secondary/clang/tools/driver/BUILD.gn +++ b/llvm/utils/gn/secondary/clang/tools/driver/BUILD.gn @@ -57,6 +57,7 @@ driver_executable("clang") { "//clang/lib/Frontend", "//clang/lib/FrontendTool", "//clang/lib/Headers", + "//clang/lib/Options", "//clang/tools/clang-linker-wrapper", "//clang/tools/clang-nvlink-wrapper", "//clang/tools/clang-offload-bundler", From 4d2741361a3e1cebbcd7abb28b7abdcbe74c91b7 Mon Sep 17 00:00:00 2001 From: Alex Langford Date: Tue, 11 Nov 2025 10:00:42 -0800 Subject: [PATCH 14/64] [lldb] Fix reading duplicate objc class metdata from shared cache (#167405) The code for v16 of the shared cache objc class layout was copy/pasted from the previous versions incorrectly. Namely, the wrong class offset list was used and the class_infos index was never updated. 
rdar://164430695
---
 .../ObjC/AppleObjCRuntime/AppleObjCRuntimeV2.cpp            | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/lldb/source/Plugins/LanguageRuntime/ObjC/AppleObjCRuntime/AppleObjCRuntimeV2.cpp b/lldb/source/Plugins/LanguageRuntime/ObjC/AppleObjCRuntime/AppleObjCRuntimeV2.cpp
index 9beb133f5595f..9fff4adbff79d 100644
--- a/lldb/source/Plugins/LanguageRuntime/ObjC/AppleObjCRuntime/AppleObjCRuntimeV2.cpp
+++ b/lldb/source/Plugins/LanguageRuntime/ObjC/AppleObjCRuntime/AppleObjCRuntimeV2.cpp
@@ -537,7 +537,7 @@ __lldb_apple_objc_v2_get_shared_cache_class_info (void *objc_opt_ro_ptr,
       for (uint32_t i=0; iversion >= 12 && objc_opt->version <= 15)

From 42750194fb4acccd5d4ceb0c7bb04743eeebe3e6 Mon Sep 17 00:00:00 2001
From: Nick Sarnie
Date: Wed, 12 Nov 2025 03:11:23 +0900
Subject: [PATCH 15/64] [clang][SPIRV] Don't addrspacecast nullptr for
 function pointer types (#167379)

According to the
[spec](https://github.com/intel/llvm/blob/sycl/sycl/doc/design/spirv-extensions/SPV_INTEL_function_pointers.asciidoc),
it is illegal to addrspacecast to the generic AS, so use the function
pointer AS for null constants.

"It is illegal to use Function Pointer as 'Pointer' argument of
OpPtrCastToGeneric."

This was found when compiling the OpenMP Device RTL for SPIR-V.

Signed-off-by: Nick Sarnie
---
 clang/lib/CodeGen/Targets/SPIR.cpp    | 10 +++++++++-
 clang/test/CodeGenSPIRV/spirv-intel.c | 12 ++++++++++--
 2 files changed, 19 insertions(+), 3 deletions(-)

diff --git a/clang/lib/CodeGen/Targets/SPIR.cpp b/clang/lib/CodeGen/Targets/SPIR.cpp
index abd049aca0ed7..161a944b16bda 100644
--- a/clang/lib/CodeGen/Targets/SPIR.cpp
+++ b/clang/lib/CodeGen/Targets/SPIR.cpp
@@ -260,8 +260,16 @@ CommonSPIRTargetCodeGenInfo::getNullPointer(const CodeGen::CodeGenModule &CGM,
   LangAS AS = QT->getUnqualifiedDesugaredType()->isNullPtrType()
                   ? LangAS::Default
                   : QT->getPointeeType().getAddressSpace();
+  unsigned ASAsInt = static_cast<unsigned>(AS);
+  unsigned FirstTargetASAsInt =
+      static_cast<unsigned>(LangAS::FirstTargetAddressSpace);
+  unsigned CodeSectionINTELAS = FirstTargetASAsInt + 9;
+  // As per SPV_INTEL_function_pointers, it is illegal to addrspacecast
+ bool IsFunctionPtrAS = + CGM.getTriple().isSPIRV() && ASAsInt == CodeSectionINTELAS; if (AS == LangAS::Default || AS == LangAS::opencl_generic || - AS == LangAS::opencl_constant) + AS == LangAS::opencl_constant || IsFunctionPtrAS) return llvm::ConstantPointerNull::get(PT); auto &Ctx = CGM.getContext(); diff --git a/clang/test/CodeGenSPIRV/spirv-intel.c b/clang/test/CodeGenSPIRV/spirv-intel.c index 997cd6f10b90c..f00fc97adaec7 100644 --- a/clang/test/CodeGenSPIRV/spirv-intel.c +++ b/clang/test/CodeGenSPIRV/spirv-intel.c @@ -1,5 +1,5 @@ -// RUN: %clang_cc1 -triple spirv64-intel %s -emit-llvm -o - | FileCheck -check-prefix=CHECK-WITH-64 %s -// RUN: %clang_cc1 -triple spirv32-intel %s -emit-llvm -o - | FileCheck -check-prefix=CHECK-WITH-32 %s +// RUN: %clang_cc1 -triple spirv64-intel %s -emit-llvm -o - | FileCheck -check-prefixes=CHECK-WITH,CHECK-WITH-64 %s +// RUN: %clang_cc1 -triple spirv32-intel %s -emit-llvm -o - | FileCheck -check-prefixes=CHECK-WITH,CHECK-WITH-32 %s // RUN: %clang_cc1 -triple spir-intel %s -emit-llvm -o - | FileCheck -check-prefix=CHECK-WITHOUT %s // RUN: %clang_cc1 -triple spir64-intel %s -emit-llvm -o - | FileCheck -check-prefix=CHECK-WITHOUT %s @@ -9,3 +9,11 @@ // CHECK-WITHOUT: spir_func void @foo(ptr noundef %param) #0 { void foo(int *param) { } + +typedef __attribute__((address_space(9))) void * FnPtrTy; + +// CHECK-WITH: %{{.*}} = icmp eq ptr addrspace(9) %{{.*}}, null +int bar() { + FnPtrTy FnPtr = (FnPtrTy)foo; + return FnPtr == 0; +} From 4a81e426f0b08b64973b8f2d561fba32c66a74b1 Mon Sep 17 00:00:00 2001 From: Nico Weber Date: Tue, 11 Nov 2025 13:16:24 -0500 Subject: [PATCH 16/64] [gn] port f63d33da0a51 more --- llvm/utils/gn/secondary/clang/lib/Driver/BUILD.gn | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/utils/gn/secondary/clang/lib/Driver/BUILD.gn b/llvm/utils/gn/secondary/clang/lib/Driver/BUILD.gn index 8cc4c78bf569a..66dbf6152472a 100644 --- a/llvm/utils/gn/secondary/clang/lib/Driver/BUILD.gn +++ b/llvm/utils/gn/secondary/clang/lib/Driver/BUILD.gn @@ -14,6 +14,7 @@ static_library("Driver") { # See the review thread of r311958 for details. 
"//clang/include/clang/StaticAnalyzer/Checkers", "//clang/lib/Basic", + "//clang/lib/Options", "//llvm/include/llvm/Config:llvm-config", "//llvm/lib/BinaryFormat", "//llvm/lib/Option", From cf6b443c688971161542ef2d93e9fd67ffd21f8c Mon Sep 17 00:00:00 2001 From: Justin Stitt Date: Tue, 11 Nov 2025 10:22:40 -0800 Subject: [PATCH 17/64] [Clang] Consider reachability for file-scope warnings on initializers (#163885) Co-authored-by: Nathan Huckleberry --- clang/docs/ReleaseNotes.rst | 3 + clang/include/clang/Parse/Parser.h | 6 +- .../clang/Sema/AnalysisBasedWarnings.h | 10 ++ clang/include/clang/Sema/Sema.h | 5 + clang/lib/Analysis/AnalysisDeclContext.cpp | 5 + clang/lib/Parse/ParseDecl.cpp | 2 +- clang/lib/Parse/ParseDeclCXX.cpp | 2 +- clang/lib/Parse/ParseInit.cpp | 23 ++++ clang/lib/Parse/ParseOpenMP.cpp | 2 +- clang/lib/Sema/AnalysisBasedWarnings.cpp | 105 +++++++++++------- clang/lib/Sema/SemaDecl.cpp | 17 +++ clang/lib/Sema/SemaExpr.cpp | 43 +++---- .../lib/Sema/SemaTemplateInstantiateDecl.cpp | 6 + clang/test/Sema/warn-unreachable-file-scope.c | 37 ++++++ 14 files changed, 200 insertions(+), 66 deletions(-) create mode 100644 clang/test/Sema/warn-unreachable-file-scope.c diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 980dbf1ff2cf6..5b95b44ea9450 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -345,6 +345,9 @@ Improvements to Clang's diagnostics ----------------------------------- - Diagnostics messages now refer to ``structured binding`` instead of ``decomposition``, to align with `P0615R0 `_ changing the term. (#GH157880) +- Clang now suppresses runtime behavior warnings for unreachable code in file-scope + variable initializers, matching the behavior for functions. This prevents false + positives for operations in unreachable branches of constant expressions. - Added a separate diagnostic group ``-Wfunction-effect-redeclarations``, for the more pedantic diagnostics for function effects (``[[clang::nonblocking]]`` and ``[[clang::nonallocating]]``). Moved the warning for a missing (though implied) attribute on a redeclaration into this group. diff --git a/clang/include/clang/Parse/Parser.h b/clang/include/clang/Parse/Parser.h index dad8efd0f017f..58eb1c0a7c114 100644 --- a/clang/include/clang/Parse/Parser.h +++ b/clang/include/clang/Parse/Parser.h @@ -5223,11 +5223,7 @@ class Parser : public CodeCompletionHandler { /// assignment-expression /// '{' ... /// \endverbatim - ExprResult ParseInitializer() { - if (Tok.isNot(tok::l_brace)) - return ParseAssignmentExpression(); - return ParseBraceInitializer(); - } + ExprResult ParseInitializer(Decl *DeclForInitializer = nullptr); /// MayBeDesignationStart - Return true if the current token might be the /// start of a designator. 
If we can tell it is impossible that it is a diff --git a/clang/include/clang/Sema/AnalysisBasedWarnings.h b/clang/include/clang/Sema/AnalysisBasedWarnings.h index 4103c3f006a8f..20a2030f56034 100644 --- a/clang/include/clang/Sema/AnalysisBasedWarnings.h +++ b/clang/include/clang/Sema/AnalysisBasedWarnings.h @@ -14,15 +14,19 @@ #define LLVM_CLANG_SEMA_ANALYSISBASEDWARNINGS_H #include "clang/AST/Decl.h" +#include "clang/Sema/ScopeInfo.h" #include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/MapVector.h" #include namespace clang { +class AnalysisDeclContext; class Decl; class FunctionDecl; class QualType; class Sema; +class VarDecl; namespace sema { class FunctionScopeInfo; class SemaPPCallbacks; @@ -57,6 +61,8 @@ class AnalysisBasedWarnings { enum VisitFlag { NotVisited = 0, Visited = 1, Pending = 2 }; llvm::DenseMap VisitedFD; + std::multimap + VarDeclPossiblyUnreachableDiags; Policy PolicyOverrides; void clearOverrides(); @@ -107,6 +113,10 @@ class AnalysisBasedWarnings { // Issue warnings that require whole-translation-unit analysis. void IssueWarnings(TranslationUnitDecl *D); + void registerVarDeclWarning(VarDecl *VD, PossiblyUnreachableDiag PUD); + + void issueWarningsForRegisteredVarDecl(VarDecl *VD); + // Gets the default policy which is in effect at the given source location. Policy getPolicyInEffectAt(SourceLocation Loc); diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h index 0470645a9e7ad..163ab32fafa48 100644 --- a/clang/include/clang/Sema/Sema.h +++ b/clang/include/clang/Sema/Sema.h @@ -6756,6 +6756,11 @@ class Sema final : public SemaBase { /// suffice, e.g., in a default function argument. Decl *ManglingContextDecl; + /// Declaration for initializer if one is currently being + /// parsed. Used when an expression has a possibly unreachable + /// diagnostic to reference the declaration as a whole. + VarDecl *DeclForInitializer = nullptr; + /// If we are processing a decltype type, a set of call expressions /// for which we have deferred checking the completeness of the return type. SmallVector DelayedDecltypeCalls; diff --git a/clang/lib/Analysis/AnalysisDeclContext.cpp b/clang/lib/Analysis/AnalysisDeclContext.cpp index 5a52056f3e6a5..f188fc6921ed1 100644 --- a/clang/lib/Analysis/AnalysisDeclContext.cpp +++ b/clang/lib/Analysis/AnalysisDeclContext.cpp @@ -117,6 +117,11 @@ Stmt *AnalysisDeclContext::getBody(bool &IsAutosynthesized) const { return BD->getBody(); else if (const auto *FunTmpl = dyn_cast_or_null(D)) return FunTmpl->getTemplatedDecl()->getBody(); + else if (const auto *VD = dyn_cast_or_null(D)) { + if (VD->isFileVarDecl()) { + return const_cast(dyn_cast_or_null(VD->getInit())); + } + } llvm_unreachable("unknown code decl"); } diff --git a/clang/lib/Parse/ParseDecl.cpp b/clang/lib/Parse/ParseDecl.cpp index 5fcb659768655..8688ccf41acb5 100644 --- a/clang/lib/Parse/ParseDecl.cpp +++ b/clang/lib/Parse/ParseDecl.cpp @@ -2613,7 +2613,7 @@ Decl *Parser::ParseDeclarationAfterDeclaratorAndAttributes( } PreferredType.enterVariableInit(Tok.getLocation(), ThisDecl); - ExprResult Init = ParseInitializer(); + ExprResult Init = ParseInitializer(ThisDecl); // If this is the only decl in (possibly) range based for statement, // our best guess is that the user meant ':' instead of '='. 
diff --git a/clang/lib/Parse/ParseDeclCXX.cpp b/clang/lib/Parse/ParseDeclCXX.cpp index b96968d4592f5..d8ed7e3ff96bd 100644 --- a/clang/lib/Parse/ParseDeclCXX.cpp +++ b/clang/lib/Parse/ParseDeclCXX.cpp @@ -3359,7 +3359,7 @@ ExprResult Parser::ParseCXXMemberInitializer(Decl *D, bool IsFunction, Diag(Tok, diag::err_ms_property_initializer) << PD; return ExprError(); } - return ParseInitializer(); + return ParseInitializer(D); } void Parser::SkipCXXMemberSpecification(SourceLocation RecordLoc, diff --git a/clang/lib/Parse/ParseInit.cpp b/clang/lib/Parse/ParseInit.cpp index a3be3744a9327..0e86c4c48d5e4 100644 --- a/clang/lib/Parse/ParseInit.cpp +++ b/clang/lib/Parse/ParseInit.cpp @@ -581,3 +581,26 @@ bool Parser::ParseMicrosoftIfExistsBraceInitializer(ExprVector &InitExprs, return !trailingComma; } + +ExprResult Parser::ParseInitializer(Decl *DeclForInitializer) { + // Set DeclForInitializer for file-scope variables. + // For constexpr references, set it to suppress runtime warnings. + // For non-constexpr references, don't set it to avoid evaluation issues + // with self-referencing initializers. Local variables (including local + // constexpr) should emit runtime warnings. + if (DeclForInitializer && !Actions.ExprEvalContexts.empty()) { + if (auto *VD = dyn_cast(DeclForInitializer); + VD && VD->isFileVarDecl() && + (!VD->getType()->isReferenceType() || VD->isConstexpr())) + Actions.ExprEvalContexts.back().DeclForInitializer = VD; + } + + ExprResult init; + if (Tok.isNot(tok::l_brace)) { + init = ParseAssignmentExpression(); + } else { + init = ParseBraceInitializer(); + } + + return init; +} diff --git a/clang/lib/Parse/ParseOpenMP.cpp b/clang/lib/Parse/ParseOpenMP.cpp index 334438edfc2e8..32a406e2c065f 100644 --- a/clang/lib/Parse/ParseOpenMP.cpp +++ b/clang/lib/Parse/ParseOpenMP.cpp @@ -339,7 +339,7 @@ void Parser::ParseOpenMPReductionInitializerForDecl(VarDecl *OmpPrivParm) { } PreferredType.enterVariableInit(Tok.getLocation(), OmpPrivParm); - ExprResult Init = ParseInitializer(); + ExprResult Init = ParseInitializer(OmpPrivParm); if (Init.isInvalid()) { SkipUntil(tok::r_paren, tok::annot_pragma_openmp_end, StopBeforeMatch); diff --git a/clang/lib/Sema/AnalysisBasedWarnings.cpp b/clang/lib/Sema/AnalysisBasedWarnings.cpp index 140b709dbb651..41a98323450e4 100644 --- a/clang/lib/Sema/AnalysisBasedWarnings.cpp +++ b/clang/lib/Sema/AnalysisBasedWarnings.cpp @@ -2734,6 +2734,70 @@ static void flushDiagnostics(Sema &S, const sema::FunctionScopeInfo *fscope) { S.Diag(D.Loc, D.PD); } +template +static void emitPossiblyUnreachableDiags(Sema &S, AnalysisDeclContext &AC, + std::pair PUDs) { + + if (PUDs.first == PUDs.second) + return; + + for (auto I = PUDs.first; I != PUDs.second; ++I) { + for (const Stmt *S : I->Stmts) + AC.registerForcedBlockExpression(S); + } + + if (AC.getCFG()) { + CFGReverseBlockReachabilityAnalysis *Analysis = + AC.getCFGReachablityAnalysis(); + + for (auto I = PUDs.first; I != PUDs.second; ++I) { + const auto &D = *I; + if (llvm::all_of(D.Stmts, [&](const Stmt *St) { + const CFGBlock *Block = AC.getBlockForRegisteredExpression(St); + // FIXME: We should be able to assert that block is non-null, but + // the CFG analysis can skip potentially-evaluated expressions in + // edge cases; see test/Sema/vla-2.c. 
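+            // Statements that cannot be mapped to a CFG block are treated as
+            // reachable; any statement proven unreachable makes this lambda
+            // return false, which suppresses the diagnostic below.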
+ if (Block && Analysis) + if (!Analysis->isReachable(&AC.getCFG()->getEntry(), Block)) + return false; + return true; + })) { + S.Diag(D.Loc, D.PD); + } + } + } else { + for (auto I = PUDs.first; I != PUDs.second; ++I) + S.Diag(I->Loc, I->PD); + } +} + +void sema::AnalysisBasedWarnings::registerVarDeclWarning( + VarDecl *VD, clang::sema::PossiblyUnreachableDiag PUD) { + VarDeclPossiblyUnreachableDiags.emplace(VD, PUD); +} + +void sema::AnalysisBasedWarnings::issueWarningsForRegisteredVarDecl( + VarDecl *VD) { + if (!llvm::is_contained(VarDeclPossiblyUnreachableDiags, VD)) + return; + + AnalysisDeclContext AC(/*Mgr=*/nullptr, VD); + + AC.getCFGBuildOptions().PruneTriviallyFalseEdges = true; + AC.getCFGBuildOptions().AddEHEdges = false; + AC.getCFGBuildOptions().AddInitializers = true; + AC.getCFGBuildOptions().AddImplicitDtors = true; + AC.getCFGBuildOptions().AddTemporaryDtors = true; + AC.getCFGBuildOptions().AddCXXNewAllocator = false; + AC.getCFGBuildOptions().AddCXXDefaultInitExprInCtors = true; + + auto Range = VarDeclPossiblyUnreachableDiags.equal_range(VD); + auto SecondRange = + llvm::make_second_range(llvm::make_range(Range.first, Range.second)); + emitPossiblyUnreachableDiags( + S, AC, std::make_pair(SecondRange.begin(), SecondRange.end())); +} + // An AST Visitor that calls a callback function on each callable DEFINITION // that is NOT in a dependent context: class CallableVisitor : public DynamicRecursiveASTVisitor { @@ -2945,45 +3009,8 @@ void clang::sema::AnalysisBasedWarnings::IssueWarnings( } // Emit delayed diagnostics. - if (!fscope->PossiblyUnreachableDiags.empty()) { - bool analyzed = false; - - // Register the expressions with the CFGBuilder. - for (const auto &D : fscope->PossiblyUnreachableDiags) { - for (const Stmt *S : D.Stmts) - AC.registerForcedBlockExpression(S); - } - - if (AC.getCFG()) { - analyzed = true; - for (const auto &D : fscope->PossiblyUnreachableDiags) { - bool AllReachable = true; - for (const Stmt *S : D.Stmts) { - const CFGBlock *block = AC.getBlockForRegisteredExpression(S); - CFGReverseBlockReachabilityAnalysis *cra = - AC.getCFGReachablityAnalysis(); - // FIXME: We should be able to assert that block is non-null, but - // the CFG analysis can skip potentially-evaluated expressions in - // edge cases; see test/Sema/vla-2.c. - if (block && cra) { - // Can this block be reached from the entrance? - if (!cra->isReachable(&AC.getCFG()->getEntry(), block)) { - AllReachable = false; - break; - } - } - // If we cannot map to a basic block, assume the statement is - // reachable. - } - - if (AllReachable) - S.Diag(D.Loc, D.PD); - } - } - - if (!analyzed) - flushDiagnostics(S, fscope); - } + auto &PUDs = fscope->PossiblyUnreachableDiags; + emitPossiblyUnreachableDiags(S, AC, std::make_pair(PUDs.begin(), PUDs.end())); // Warning: check missing 'return' if (P.enableCheckFallThrough) { diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp index 086dd8ba1c670..25b89d65847ad 100644 --- a/clang/lib/Sema/SemaDecl.cpp +++ b/clang/lib/Sema/SemaDecl.cpp @@ -59,6 +59,7 @@ #include "clang/Sema/SemaWasm.h" #include "clang/Sema/Template.h" #include "llvm/ADT/STLForwardCompat.h" +#include "llvm/ADT/ScopeExit.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallString.h" #include "llvm/ADT/StringExtras.h" @@ -13117,6 +13118,13 @@ namespace { if (isa(OrigDecl)) return; + // Skip checking for file-scope constexpr variables - constant evaluation + // will produce appropriate errors without needing runtime diagnostics. 
+ // Local constexpr should still emit runtime warnings. + if (auto *VD = dyn_cast(OrigDecl); + VD && VD->isConstexpr() && VD->isFileVarDecl()) + return; + E = E->IgnoreParens(); // Skip checking T a = a where T is not a record or reference type. @@ -13744,6 +13752,11 @@ void Sema::DiagnoseUniqueObjectDuplication(const VarDecl *VD) { } void Sema::AddInitializerToDecl(Decl *RealDecl, Expr *Init, bool DirectInit) { + auto ResetDeclForInitializer = llvm::make_scope_exit([this]() { + if (this->ExprEvalContexts.empty()) + this->ExprEvalContexts.back().DeclForInitializer = nullptr; + }); + // If there is no declaration, there was an error parsing it. Just ignore // the initializer. if (!RealDecl) { @@ -15069,6 +15082,10 @@ void Sema::FinalizeDeclaration(Decl *ThisDecl) { if (!VD) return; + // Emit any deferred warnings for the variable's initializer, even if the + // variable is invalid + AnalysisWarnings.issueWarningsForRegisteredVarDecl(VD); + // Apply an implicit SectionAttr if '#pragma clang section bss|data|rodata' is active if (VD->hasGlobalStorage() && VD->isThisDeclarationADefinition() && !inTemplateInstantiation() && !VD->hasAttr()) { diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp index 2159a0dc2a5d7..10f0ec3010c6c 100644 --- a/clang/lib/Sema/SemaExpr.cpp +++ b/clang/lib/Sema/SemaExpr.cpp @@ -20565,31 +20565,36 @@ void Sema::MarkDeclarationsReferencedInExpr(Expr *E, } /// Emit a diagnostic when statements are reachable. -/// FIXME: check for reachability even in expressions for which we don't build a -/// CFG (eg, in the initializer of a global or in a constant expression). -/// For example, -/// namespace { auto *p = new double[3][false ? (1, 2) : 3]; } bool Sema::DiagIfReachable(SourceLocation Loc, ArrayRef Stmts, const PartialDiagnostic &PD) { - if (!Stmts.empty() && getCurFunctionOrMethodDecl()) { - if (!FunctionScopes.empty()) - FunctionScopes.back()->PossiblyUnreachableDiags.push_back( - sema::PossiblyUnreachableDiag(PD, Loc, Stmts)); - return true; - } - + VarDecl *Decl = ExprEvalContexts.back().DeclForInitializer; // The initializer of a constexpr variable or of the first declaration of a // static data member is not syntactically a constant evaluated constant, // but nonetheless is always required to be a constant expression, so we // can skip diagnosing. - // FIXME: Using the mangling context here is a hack. - if (auto *VD = dyn_cast_or_null( - ExprEvalContexts.back().ManglingContextDecl)) { - if (VD->isConstexpr() || - (VD->isStaticDataMember() && VD->isFirstDecl() && !VD->isInline())) - return false; - // FIXME: For any other kind of variable, we should build a CFG for its - // initializer and check whether the context in question is reachable. + if (Decl && + (Decl->isConstexpr() || (Decl->isStaticDataMember() && + Decl->isFirstDecl() && !Decl->isInline()))) + return false; + + if (Stmts.empty()) { + Diag(Loc, PD); + return true; + } + + if (getCurFunction()) { + FunctionScopes.back()->PossiblyUnreachableDiags.push_back( + sema::PossiblyUnreachableDiag(PD, Loc, Stmts)); + return true; + } + + // For non-constexpr file-scope variables with reachability context (non-empty + // Stmts), build a CFG for the initializer and check whether the context in + // question is reachable. 
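+  // For example, a file-scope 'u8 x = (0 ? 0xffff : 0xff);' registers its
+  // conversion warning here and later drops it, because the 0xffff arm is
+  // unreachable (see test/Sema/warn-unreachable-file-scope.c).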
+ if (Decl && Decl->isFileVarDecl()) { + AnalysisWarnings.registerVarDeclWarning( + Decl, sema::PossiblyUnreachableDiag(PD, Loc, Stmts)); + return true; } Diag(Loc, PD); diff --git a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp index 4d58f00168298..a56017cd7b7e7 100644 --- a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp +++ b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp @@ -6198,6 +6198,10 @@ void Sema::InstantiateVariableInitializer( currentEvaluationContext().RebuildDefaultArgOrDefaultInit = parentEvaluationContext().RebuildDefaultArgOrDefaultInit; + // Set DeclForInitializer for this variable so DiagIfReachable can properly + // suppress runtime diagnostics for constexpr/static member variables + currentEvaluationContext().DeclForInitializer = Var; + if (OldVar->getInit()) { // Instantiate the initializer. ExprResult Init = @@ -6467,6 +6471,8 @@ void Sema::InstantiateVariableDefinition(SourceLocation PointOfInstantiation, PassToConsumerRAII.Var = Var; Var->setTemplateSpecializationKind(OldVar->getTemplateSpecializationKind(), OldVar->getPointOfInstantiation()); + // Emit any deferred warnings for the variable's initializer + AnalysisWarnings.issueWarningsForRegisteredVarDecl(Var); } // This variable may have local implicit instantiations that need to be diff --git a/clang/test/Sema/warn-unreachable-file-scope.c b/clang/test/Sema/warn-unreachable-file-scope.c new file mode 100644 index 0000000000000..64a6918cbcf77 --- /dev/null +++ b/clang/test/Sema/warn-unreachable-file-scope.c @@ -0,0 +1,37 @@ +// RUN: %clang_cc1 -fsyntax-only -verify %s + +typedef unsigned char u8; + +u8 a1 = (0 ? 0xffff : 0xff); +u8 a2 = (1 ? 0xffff : 0xff); // expected-warning {{implicit conversion from 'int' to 'u8' (aka 'unsigned char') changes value from 65535 to 255}} +u8 a3 = (1 ? 0xff : 0xffff); +u8 a4 = (0 ? 0xff : 0xffff); // expected-warning {{implicit conversion from 'int' to 'u8' (aka 'unsigned char') changes value from 65535 to 255}} + +unsigned long long b1 = 1 ? 0 : 1ULL << 64; +unsigned long long b2 = 0 ? 0 : 1ULL << 64; // expected-warning {{shift count >= width of type}} +unsigned long long b3 = 1 ? 1ULL << 64 : 0; // expected-warning {{shift count >= width of type}} + +#define M(n) (((n) == 64) ? ~0ULL : ((1ULL << (n)) - 1)) +unsigned long long c1 = M(64); +unsigned long long c2 = M(32); + +static u8 d1 = (0 ? 0xffff : 0xff); +static u8 d2 = (1 ? 0xffff : 0xff); // expected-warning {{implicit conversion from 'int' to 'u8' (aka 'unsigned char') changes value from 65535 to 255}} + +int a = 1 ? 6 : (1,2); +int b = 0 ? 6 : (1,2); // expected-warning {{left operand of comma operator has no effect}} + +void f(void) { + u8 e1 = (0 ? 0xffff : 0xff); + u8 e2 = (1 ? 0xffff : 0xff); // expected-warning {{implicit conversion from 'int' to 'u8' (aka 'unsigned char') changes value from 65535 to 255}} + + unsigned long long e3 = 1 ? 0 : 1ULL << 64; + unsigned long long e4 = 0 ? 0 : 1ULL << 64; // expected-warning {{shift count >= width of type}} +} + +void statics(void) { + static u8 f1 = (0 ? 0xffff : 0xff); + static u8 f2 = (1 ? 0xffff : 0xff); // expected-warning {{implicit conversion from 'int' to 'u8' (aka 'unsigned char') changes value from 65535 to 255}} + static u8 f3 = (1 ? 0xff : 0xffff); + static u8 f4 = (0 ? 
0xff : 0xffff); // expected-warning {{implicit conversion from 'int' to 'u8' (aka 'unsigned char') changes value from 65535 to 255}} +} From 49dc49e12539e7d158ef09355aaf567033b9d057 Mon Sep 17 00:00:00 2001 From: Victor Campos Date: Tue, 11 Nov 2025 18:29:22 +0000 Subject: [PATCH 18/64] [libc][math] Add `asin` to baremetal Arm and AArch64 (#167339) This patch adds `asin` to the entry points for Arm and AArch64. Tests have been run using Arm Toolchain for Embedded, a downstream toolchain. --- libc/config/baremetal/aarch64/entrypoints.txt | 1 + libc/config/baremetal/arm/entrypoints.txt | 1 + 2 files changed, 2 insertions(+) diff --git a/libc/config/baremetal/aarch64/entrypoints.txt b/libc/config/baremetal/aarch64/entrypoints.txt index 049adb34d9d79..c69ab3d0bb37c 100644 --- a/libc/config/baremetal/aarch64/entrypoints.txt +++ b/libc/config/baremetal/aarch64/entrypoints.txt @@ -323,6 +323,7 @@ set(TARGET_LIBM_ENTRYPOINTS libc.src.math.acos libc.src.math.acosf libc.src.math.acoshf + libc.src.math.asin libc.src.math.asinf libc.src.math.asinhf libc.src.math.atan2 diff --git a/libc/config/baremetal/arm/entrypoints.txt b/libc/config/baremetal/arm/entrypoints.txt index 2444ec5feff01..c566f8ad08c8e 100644 --- a/libc/config/baremetal/arm/entrypoints.txt +++ b/libc/config/baremetal/arm/entrypoints.txt @@ -326,6 +326,7 @@ set(TARGET_LIBM_ENTRYPOINTS libc.src.math.acos libc.src.math.acosf libc.src.math.acoshf + libc.src.math.asin libc.src.math.asinf libc.src.math.asinhf libc.src.math.atan2 From 85d2b10838389a01251d8e3c7b046196429bdc5b Mon Sep 17 00:00:00 2001 From: zhijian lin Date: Tue, 11 Nov 2025 13:34:14 -0500 Subject: [PATCH 19/64] [DAG] Make strictfp attribute only restricts for libm and make non-math optimizations possible (#165464) the patch [Add strictfp attribute to prevent unwanted optimizations of libm calls](https://reviews.llvm.org/D34163) add `I.isStrictFP()` into ``` if (!I.isNoBuiltin() && !I.isStrictFP() && !F->hasLocalLinkage() && F->hasName() && LibInfo->getLibFunc(*F, Func) && LibInfo->hasOptimizedCodeGen(Func)) ``` it prevents the backend from optimizing even non-math libcalls such as `strlen` and `memcmp` if a call has the strict floating-point attribute. For example, it prevent converting strlen and memcmp to milicode call __strlen and __memcmp. --- .../SelectionDAG/SelectionDAGBuilder.cpp | 15 ++++---- llvm/test/CodeGen/PowerPC/milicode32.ll | 34 +++++++++++++++++++ 2 files changed, 43 insertions(+), 6 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index 6a9022dff41ad..9baf72b266aa7 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -9456,7 +9456,9 @@ bool SelectionDAGBuilder::visitStrNLenCall(const CallInst &I) { bool SelectionDAGBuilder::visitUnaryFloatCall(const CallInst &I, unsigned Opcode) { // We already checked this call's prototype; verify it doesn't modify errno. - if (!I.onlyReadsMemory()) + // Do not perform optimizations for call sites that require strict + // floating-point semantics. + if (!I.onlyReadsMemory() || I.isStrictFP()) return false; SDNodeFlags Flags; @@ -9476,7 +9478,9 @@ bool SelectionDAGBuilder::visitUnaryFloatCall(const CallInst &I, bool SelectionDAGBuilder::visitBinaryFloatCall(const CallInst &I, unsigned Opcode) { // We already checked this call's prototype; verify it doesn't modify errno. 
- if (!I.onlyReadsMemory()) + // Do not perform optimizations for call sites that require strict + // floating-point semantics. + if (!I.onlyReadsMemory() || I.isStrictFP()) return false; SDNodeFlags Flags; @@ -9509,11 +9513,10 @@ void SelectionDAGBuilder::visitCall(const CallInst &I) { // Check for well-known libc/libm calls. If the function is internal, it // can't be a library call. Don't do the check if marked as nobuiltin for - // some reason or the call site requires strict floating point semantics. + // some reason. LibFunc Func; - if (!I.isNoBuiltin() && !I.isStrictFP() && !F->hasLocalLinkage() && - F->hasName() && LibInfo->getLibFunc(*F, Func) && - LibInfo->hasOptimizedCodeGen(Func)) { + if (!I.isNoBuiltin() && !F->hasLocalLinkage() && F->hasName() && + LibInfo->getLibFunc(*F, Func) && LibInfo->hasOptimizedCodeGen(Func)) { switch (Func) { default: break; case LibFunc_bcmp: diff --git a/llvm/test/CodeGen/PowerPC/milicode32.ll b/llvm/test/CodeGen/PowerPC/milicode32.ll index ddadd01a748f1..b69b997254d2c 100644 --- a/llvm/test/CodeGen/PowerPC/milicode32.ll +++ b/llvm/test/CodeGen/PowerPC/milicode32.ll @@ -68,7 +68,41 @@ entry: ret i32 %call } +define i32 @strlen_test_fp_strict(ptr noundef %str) nounwind { +; CHECK-AIX-32-P9-LABEL: strlen_test_fp_strict: +; CHECK-AIX-32-P9: # %bb.0: # %entry +; CHECK-AIX-32-P9-NEXT: mflr r0 +; CHECK-AIX-32-P9-NEXT: stwu r1, -64(r1) +; CHECK-AIX-32-P9-NEXT: stw r0, 72(r1) +; CHECK-AIX-32-P9-NEXT: stw r3, 60(r1) +; CHECK-AIX-32-P9-NEXT: bl .___strlen[PR] +; CHECK-AIX-32-P9-NEXT: nop +; CHECK-AIX-32-P9-NEXT: addi r1, r1, 64 +; CHECK-AIX-32-P9-NEXT: lwz r0, 8(r1) +; CHECK-AIX-32-P9-NEXT: mtlr r0 +; CHECK-AIX-32-P9-NEXT: blr +; +; CHECK-LINUX32-P9-LABEL: strlen_test_fp_strict: +; CHECK-LINUX32-P9: # %bb.0: # %entry +; CHECK-LINUX32-P9-NEXT: mflr r0 +; CHECK-LINUX32-P9-NEXT: stwu r1, -16(r1) +; CHECK-LINUX32-P9-NEXT: stw r0, 20(r1) +; CHECK-LINUX32-P9-NEXT: stw r3, 12(r1) +; CHECK-LINUX32-P9-NEXT: bl strlen +; CHECK-LINUX32-P9-NEXT: lwz r0, 20(r1) +; CHECK-LINUX32-P9-NEXT: addi r1, r1, 16 +; CHECK-LINUX32-P9-NEXT: mtlr r0 +; CHECK-LINUX32-P9-NEXT: blr +entry: + %str.addr = alloca ptr, align 4 + store ptr %str, ptr %str.addr, align 4 + %0 = load ptr, ptr %str.addr, align 4 + %call = call i32 @strlen(ptr noundef %0) #0 + ret i32 %call +} + declare i32 @strlen(ptr noundef) nounwind +attributes #0 = { strictfp } define ptr @test_memmove(ptr noundef %destination, ptr noundef %source, i32 noundef %num) #0 { ; CHECK-AIX-32-P9-LABEL: test_memmove: From ee41ab3deae18ca25761cd86a0423338b0bbdd62 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Tue, 11 Nov 2025 10:39:31 -0800 Subject: [PATCH 20/64] [AMDGPU] Use MCRegister instead of unsigned. 
NFC (#167558) --- .../lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index 09338c533fdf2..5e0486aa1dd49 100644 --- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -1865,7 +1865,7 @@ class AMDGPUAsmParser : public MCTargetAsmParser { unsigned getConstantBusLimit(unsigned Opcode) const; bool usesConstantBus(const MCInst &Inst, unsigned OpIdx); bool isInlineConstant(const MCInst &Inst, unsigned OpIdx) const; - unsigned findImplicitSGPRReadInVOP(const MCInst &Inst) const; + MCRegister findImplicitSGPRReadInVOP(const MCInst &Inst) const; bool isSupportedMnemo(StringRef Mnemo, const FeatureBitset &FBS); @@ -3665,7 +3665,8 @@ StringRef AMDGPUAsmParser::getMatchedVariantName() const { return ""; } -unsigned AMDGPUAsmParser::findImplicitSGPRReadInVOP(const MCInst &Inst) const { +MCRegister +AMDGPUAsmParser::findImplicitSGPRReadInVOP(const MCInst &Inst) const { const MCInstrDesc &Desc = MII.get(Inst.getOpcode()); for (MCPhysReg Reg : Desc.implicit_uses()) { switch (Reg) { @@ -3679,7 +3680,7 @@ unsigned AMDGPUAsmParser::findImplicitSGPRReadInVOP(const MCInst &Inst) const { break; } } - return AMDGPU::NoRegister; + return MCRegister(); } // NB: This code is correct only when used to check constant @@ -3854,9 +3855,9 @@ bool AMDGPUAsmParser::validateConstantBusLimitations( LiteralSize = 4; } - SmallDenseSet SGPRsUsed; - unsigned SGPRUsed = findImplicitSGPRReadInVOP(Inst); - if (SGPRUsed != AMDGPU::NoRegister) { + SmallDenseSet SGPRsUsed; + MCRegister SGPRUsed = findImplicitSGPRReadInVOP(Inst); + if (SGPRUsed) { SGPRsUsed.insert(SGPRUsed); ++ConstantBusUseCount; } From 1eaff1924db93e561178490ff787f8ce1b52ed83 Mon Sep 17 00:00:00 2001 From: spupyrev Date: Tue, 11 Nov 2025 10:42:58 -0800 Subject: [PATCH 21/64] Revert "Adding Matching and Inference Functionality to Propeller-PR4: Implement matching and inference and create clusters" (#167559) Reverts llvm/llvm-project#165868 due to buildbot failures Co-authored-by: spupyrev --- .../CodeGen/BasicBlockMatchingAndInference.h | 62 ------ .../CodeGen/BasicBlockSectionsProfileReader.h | 7 - .../llvm/CodeGen/MachineBlockHashInfo.h | 2 - llvm/include/llvm/CodeGen/Passes.h | 4 - llvm/include/llvm/InitializePasses.h | 1 - .../Transforms/Utils/SampleProfileInference.h | 16 -- .../BasicBlockMatchingAndInference.cpp | 195 ------------------ llvm/lib/CodeGen/BasicBlockSections.cpp | 85 +------- .../BasicBlockSectionsProfileReader.cpp | 15 -- llvm/lib/CodeGen/CMakeLists.txt | 1 - llvm/lib/CodeGen/TargetPassConfig.cpp | 13 +- .../Utils/SampleProfileInference.cpp | 2 + .../basic-block-sections-clusters-bb-hash.ll | 99 --------- 13 files changed, 6 insertions(+), 496 deletions(-) delete mode 100644 llvm/include/llvm/CodeGen/BasicBlockMatchingAndInference.h delete mode 100644 llvm/lib/CodeGen/BasicBlockMatchingAndInference.cpp delete mode 100644 llvm/test/CodeGen/X86/basic-block-sections-clusters-bb-hash.ll diff --git a/llvm/include/llvm/CodeGen/BasicBlockMatchingAndInference.h b/llvm/include/llvm/CodeGen/BasicBlockMatchingAndInference.h deleted file mode 100644 index 6e9bbb969a445..0000000000000 --- a/llvm/include/llvm/CodeGen/BasicBlockMatchingAndInference.h +++ /dev/null @@ -1,62 +0,0 @@ -//===- llvm/CodeGen/BasicBlockMatchingAndInference.h ------------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache 
License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// Infer weights for all basic blocks using matching and inference. -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_CODEGEN_BASIC_BLOCK_AND_INFERENCE_H -#define LLVM_CODEGEN_BASIC_BLOCK_AND_INFERENCE_H - -#include "llvm/CodeGen/BasicBlockSectionsProfileReader.h" -#include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/Transforms/Utils/SampleProfileInference.h" - -namespace llvm { - -class BasicBlockMatchingAndInference : public MachineFunctionPass { -private: - using Edge = std::pair; - using BlockWeightMap = DenseMap; - using EdgeWeightMap = DenseMap; - using BlockEdgeMap = DenseMap>; - - struct WeightInfo { - // Weight of basic blocks. - BlockWeightMap BlockWeights; - // Weight of edges. - EdgeWeightMap EdgeWeights; - }; - -public: - static char ID; - BasicBlockMatchingAndInference(); - - StringRef getPassName() const override { - return "Basic Block Matching and Inference"; - } - - void getAnalysisUsage(AnalysisUsage &AU) const override; - - bool runOnMachineFunction(MachineFunction &F) override; - - std::optional getWeightInfo(StringRef FuncName) const; - -private: - StringMap ProgramWeightInfo; - - WeightInfo initWeightInfoByMatching(MachineFunction &MF); - - void generateWeightInfoByInference(MachineFunction &MF, - WeightInfo &MatchWeight); -}; - -} // end namespace llvm - -#endif // LLVM_CODEGEN_BASIC_BLOCK_AND_INFERENCE_H diff --git a/llvm/include/llvm/CodeGen/BasicBlockSectionsProfileReader.h b/llvm/include/llvm/CodeGen/BasicBlockSectionsProfileReader.h index f0d28d863282e..ee1f28377f7e4 100644 --- a/llvm/include/llvm/CodeGen/BasicBlockSectionsProfileReader.h +++ b/llvm/include/llvm/CodeGen/BasicBlockSectionsProfileReader.h @@ -86,10 +86,6 @@ class BasicBlockSectionsProfileReader { uint64_t getEdgeCount(StringRef FuncName, const UniqueBBID &SrcBBID, const UniqueBBID &SinkBBID) const; - // Return the complete function path and cluster info for the given function. - std::pair - getFunctionPathAndClusterInfo(StringRef FuncName) const; - private: StringRef getAliasName(StringRef FuncName) const { auto R = FuncAliasMap.find(FuncName); @@ -199,9 +195,6 @@ class BasicBlockSectionsProfileReaderWrapperPass : public ImmutablePass { uint64_t getEdgeCount(StringRef FuncName, const UniqueBBID &SrcBBID, const UniqueBBID &DestBBID) const; - std::pair - getFunctionPathAndClusterInfo(StringRef FuncName) const; - // Initializes the FunctionNameToDIFilename map for the current module and // then reads the profile for the matching functions. bool doInitialization(Module &M) override; diff --git a/llvm/include/llvm/CodeGen/MachineBlockHashInfo.h b/llvm/include/llvm/CodeGen/MachineBlockHashInfo.h index 6f26819d566ae..d044d5f940b75 100644 --- a/llvm/include/llvm/CodeGen/MachineBlockHashInfo.h +++ b/llvm/include/llvm/CodeGen/MachineBlockHashInfo.h @@ -80,8 +80,6 @@ struct BlendedBlockHash { return Dist; } - uint16_t getOpcodeHash() const { return OpcodeHash; } - private: /// The offset of the basic block from the function start. 
uint16_t Offset{0}; diff --git a/llvm/include/llvm/CodeGen/Passes.h b/llvm/include/llvm/CodeGen/Passes.h index 2bf83cfa655b6..a8525554b142e 100644 --- a/llvm/include/llvm/CodeGen/Passes.h +++ b/llvm/include/llvm/CodeGen/Passes.h @@ -69,10 +69,6 @@ LLVM_ABI MachineFunctionPass *createBasicBlockSectionsPass(); LLVM_ABI MachineFunctionPass *createBasicBlockPathCloningPass(); -/// createBasicBlockMatchingAndInferencePass - This pass enables matching -/// and inference when using propeller. -LLVM_ABI MachineFunctionPass *createBasicBlockMatchingAndInferencePass(); - /// createMachineBlockHashInfoPass - This pass computes basic block hashes. LLVM_ABI MachineFunctionPass *createMachineBlockHashInfoPass(); diff --git a/llvm/include/llvm/InitializePasses.h b/llvm/include/llvm/InitializePasses.h index 18732caf78966..10a4d8525a9e8 100644 --- a/llvm/include/llvm/InitializePasses.h +++ b/llvm/include/llvm/InitializePasses.h @@ -55,7 +55,6 @@ LLVM_ABI void initializeAlwaysInlinerLegacyPassPass(PassRegistry &); LLVM_ABI void initializeAssignmentTrackingAnalysisPass(PassRegistry &); LLVM_ABI void initializeAssumptionCacheTrackerPass(PassRegistry &); LLVM_ABI void initializeAtomicExpandLegacyPass(PassRegistry &); -LLVM_ABI void initializeBasicBlockMatchingAndInferencePass(PassRegistry &); LLVM_ABI void initializeBasicBlockPathCloningPass(PassRegistry &); LLVM_ABI void initializeBasicBlockSectionsProfileReaderWrapperPassPass(PassRegistry &); diff --git a/llvm/include/llvm/Transforms/Utils/SampleProfileInference.h b/llvm/include/llvm/Transforms/Utils/SampleProfileInference.h index e1663d29c1e3c..7231e45fe8eb7 100644 --- a/llvm/include/llvm/Transforms/Utils/SampleProfileInference.h +++ b/llvm/include/llvm/Transforms/Utils/SampleProfileInference.h @@ -130,11 +130,6 @@ template class SampleProfileInference { SampleProfileInference(FunctionT &F, BlockEdgeMap &Successors, BlockWeightMap &SampleBlockWeights) : F(F), Successors(Successors), SampleBlockWeights(SampleBlockWeights) {} - SampleProfileInference(FunctionT &F, BlockEdgeMap &Successors, - BlockWeightMap &SampleBlockWeights, - EdgeWeightMap &SampleEdgeWeights) - : F(F), Successors(Successors), SampleBlockWeights(SampleBlockWeights), - SampleEdgeWeights(SampleEdgeWeights) {} /// Apply the profile inference algorithm for a given function void apply(BlockWeightMap &BlockWeights, EdgeWeightMap &EdgeWeights); @@ -162,9 +157,6 @@ template class SampleProfileInference { /// Map basic blocks to their sampled weights. BlockWeightMap &SampleBlockWeights; - - /// Map edges to their sampled weights. - EdgeWeightMap SampleEdgeWeights; }; template @@ -274,14 +266,6 @@ FlowFunction SampleProfileInference::createFlowFunction( FlowJump Jump; Jump.Source = BlockIndex[BB]; Jump.Target = BlockIndex[Succ]; - auto It = SampleEdgeWeights.find(std::make_pair(BB, Succ)); - if (It != SampleEdgeWeights.end()) { - Jump.HasUnknownWeight = false; - Jump.Weight = It->second; - } else { - Jump.HasUnknownWeight = true; - Jump.Weight = 0; - } Func.Jumps.push_back(Jump); } } diff --git a/llvm/lib/CodeGen/BasicBlockMatchingAndInference.cpp b/llvm/lib/CodeGen/BasicBlockMatchingAndInference.cpp deleted file mode 100644 index 4fa90799f4e10..0000000000000 --- a/llvm/lib/CodeGen/BasicBlockMatchingAndInference.cpp +++ /dev/null @@ -1,195 +0,0 @@ -//===- llvm/CodeGen/BasicBlockMatchingAndInference.cpp ----------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// In Propeller's profile, we have already read the hash values of basic blocks, -// as well as the weights of basic blocks and edges in the CFG. In this file, -// we first match the basic blocks in the profile with those in the current -// MachineFunction using the basic block hash, thereby obtaining the weights of -// some basic blocks and edges. Subsequently, we infer the weights of all basic -// blocks using an inference algorithm. -// -// TODO: Integrate part of the code in this file with BOLT's implementation into -// the LLVM infrastructure, enabling both BOLT and Propeller to reuse it. -// -//===----------------------------------------------------------------------===// - -#include "llvm/CodeGen/BasicBlockMatchingAndInference.h" -#include "llvm/CodeGen/BasicBlockSectionsProfileReader.h" -#include "llvm/CodeGen/MachineBlockHashInfo.h" -#include "llvm/CodeGen/Passes.h" -#include "llvm/InitializePasses.h" -#include - -using namespace llvm; - -static cl::opt - PropellerInferThreshold("propeller-infer-threshold", - cl::desc("Threshold for infer stale profile"), - cl::init(0.6), cl::Optional); - -/// The object is used to identify and match basic blocks given their hashes. -class StaleMatcher { -public: - /// Initialize stale matcher. - void init(const std::vector &Blocks, - const std::vector &Hashes) { - assert(Blocks.size() == Hashes.size() && - "incorrect matcher initialization"); - for (size_t I = 0; I < Blocks.size(); I++) { - MachineBasicBlock *Block = Blocks[I]; - uint16_t OpHash = Hashes[I].getOpcodeHash(); - OpHashToBlocks[OpHash].push_back(std::make_pair(Hashes[I], Block)); - } - } - - /// Find the most similar block for a given hash. 
- MachineBasicBlock *matchBlock(BlendedBlockHash BlendedHash) const { - auto BlockIt = OpHashToBlocks.find(BlendedHash.getOpcodeHash()); - if (BlockIt == OpHashToBlocks.end()) { - return nullptr; - } - MachineBasicBlock *BestBlock = nullptr; - uint64_t BestDist = std::numeric_limits::max(); - for (auto It : BlockIt->second) { - MachineBasicBlock *Block = It.second; - BlendedBlockHash Hash = It.first; - uint64_t Dist = Hash.distance(BlendedHash); - if (BestBlock == nullptr || Dist < BestDist) { - BestDist = Dist; - BestBlock = Block; - } - } - return BestBlock; - } - -private: - using HashBlockPairType = std::pair; - std::unordered_map> OpHashToBlocks; -}; - -INITIALIZE_PASS_BEGIN(BasicBlockMatchingAndInference, - "machine-block-match-infer", - "Machine Block Matching and Inference Analysis", true, - true) -INITIALIZE_PASS_DEPENDENCY(MachineBlockHashInfo) -INITIALIZE_PASS_DEPENDENCY(BasicBlockSectionsProfileReaderWrapperPass) -INITIALIZE_PASS_END(BasicBlockMatchingAndInference, "machine-block-match-infer", - "Machine Block Matching and Inference Analysis", true, true) - -char BasicBlockMatchingAndInference::ID = 0; - -BasicBlockMatchingAndInference::BasicBlockMatchingAndInference() - : MachineFunctionPass(ID) { - initializeBasicBlockMatchingAndInferencePass( - *PassRegistry::getPassRegistry()); -} - -void BasicBlockMatchingAndInference::getAnalysisUsage(AnalysisUsage &AU) const { - AU.addRequired(); - AU.addRequired(); - AU.setPreservesAll(); - MachineFunctionPass::getAnalysisUsage(AU); -} - -std::optional -BasicBlockMatchingAndInference::getWeightInfo(StringRef FuncName) const { - auto It = ProgramWeightInfo.find(FuncName); - if (It == ProgramWeightInfo.end()) { - return std::nullopt; - } - return It->second; -} - -BasicBlockMatchingAndInference::WeightInfo -BasicBlockMatchingAndInference::initWeightInfoByMatching(MachineFunction &MF) { - std::vector Blocks; - std::vector Hashes; - auto BSPR = &getAnalysis(); - auto MBHI = &getAnalysis(); - for (auto &Block : MF) { - Blocks.push_back(&Block); - Hashes.push_back(BlendedBlockHash(MBHI->getMBBHash(Block))); - } - StaleMatcher Matcher; - Matcher.init(Blocks, Hashes); - BasicBlockMatchingAndInference::WeightInfo MatchWeight; - auto [IsValid, PathAndClusterInfo] = - BSPR->getFunctionPathAndClusterInfo(MF.getName()); - if (!IsValid) - return MatchWeight; - for (auto &BlockCount : PathAndClusterInfo.NodeCounts) { - if (PathAndClusterInfo.BBHashes.count(BlockCount.first.BaseID)) { - auto Hash = PathAndClusterInfo.BBHashes[BlockCount.first.BaseID]; - MachineBasicBlock *Block = Matcher.matchBlock(BlendedBlockHash(Hash)); - // When a basic block has clone copies, sum their counts. - if (Block != nullptr) - MatchWeight.BlockWeights[Block] += BlockCount.second; - } - } - for (auto &PredItem : PathAndClusterInfo.EdgeCounts) { - auto PredID = PredItem.first.BaseID; - if (!PathAndClusterInfo.BBHashes.count(PredID)) - continue; - auto PredHash = PathAndClusterInfo.BBHashes[PredID]; - MachineBasicBlock *PredBlock = - Matcher.matchBlock(BlendedBlockHash(PredHash)); - if (PredBlock == nullptr) - continue; - for (auto &SuccItem : PredItem.second) { - auto SuccID = SuccItem.first.BaseID; - auto EdgeWeight = SuccItem.second; - if (PathAndClusterInfo.BBHashes.count(SuccID)) { - auto SuccHash = PathAndClusterInfo.BBHashes[SuccID]; - MachineBasicBlock *SuccBlock = - Matcher.matchBlock(BlendedBlockHash(SuccHash)); - // When an edge has clone copies, sum their counts. 
- if (SuccBlock != nullptr) - MatchWeight.EdgeWeights[std::make_pair(PredBlock, SuccBlock)] += - EdgeWeight; - } - } - } - return MatchWeight; -} - -void BasicBlockMatchingAndInference::generateWeightInfoByInference( - MachineFunction &MF, - BasicBlockMatchingAndInference::WeightInfo &MatchWeight) { - BlockEdgeMap Successors; - for (auto &Block : MF) { - for (auto *Succ : Block.successors()) - Successors[&Block].push_back(Succ); - } - SampleProfileInference SPI( - MF, Successors, MatchWeight.BlockWeights, MatchWeight.EdgeWeights); - BlockWeightMap BlockWeights; - EdgeWeightMap EdgeWeights; - SPI.apply(BlockWeights, EdgeWeights); - ProgramWeightInfo.try_emplace( - MF.getName(), BasicBlockMatchingAndInference::WeightInfo{ - std::move(BlockWeights), std::move(EdgeWeights)}); -} - -bool BasicBlockMatchingAndInference::runOnMachineFunction(MachineFunction &MF) { - if (MF.empty()) - return false; - auto MatchWeight = initWeightInfoByMatching(MF); - // If the ratio of the number of MBBs in matching to the total number of MBBs - // in the function is less than the threshold value, the processing should be - // abandoned. - if (static_cast(MatchWeight.BlockWeights.size()) / MF.size() < - PropellerInferThreshold) { - return false; - } - generateWeightInfoByInference(MF, MatchWeight); - return false; -} - -MachineFunctionPass *llvm::createBasicBlockMatchingAndInferencePass() { - return new BasicBlockMatchingAndInference(); -} diff --git a/llvm/lib/CodeGen/BasicBlockSections.cpp b/llvm/lib/CodeGen/BasicBlockSections.cpp index 87cd55be23194..52e2909bec072 100644 --- a/llvm/lib/CodeGen/BasicBlockSections.cpp +++ b/llvm/lib/CodeGen/BasicBlockSections.cpp @@ -70,7 +70,6 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" -#include "llvm/CodeGen/BasicBlockMatchingAndInference.h" #include "llvm/CodeGen/BasicBlockSectionUtils.h" #include "llvm/CodeGen/BasicBlockSectionsProfileReader.h" #include "llvm/CodeGen/MachineDominators.h" @@ -82,7 +81,6 @@ #include "llvm/InitializePasses.h" #include "llvm/Support/UniqueBBID.h" #include "llvm/Target/TargetMachine.h" -#include "llvm/Transforms/Utils/CodeLayout.h" #include using namespace llvm; @@ -177,79 +175,6 @@ updateBranches(MachineFunction &MF, } } -// This function generates the machine basic block clusters of "hot" blocks. -// Currently, only support one cluster creation. -// TODO: Support multi-cluster creation and path cloning. -static SmallVector -createBBClusterInfoForFunction(MachineFunction &MF, - BasicBlockMatchingAndInference *BMI) { - unsigned CurrentCluster = 0; - SmallVector BBClusterInfos; - auto OptWeightInfo = BMI->getWeightInfo(MF.getName()); - if (!OptWeightInfo) - return BBClusterInfos; - auto BlockWeights = OptWeightInfo->BlockWeights; - auto EdgeWeights = OptWeightInfo->EdgeWeights; - - SmallVector HotMBBs; - if (MF.size() <= 2) { - for (auto &MBB : MF) { - if (MBB.isEntryBlock() || BlockWeights[&MBB] > 0) { - HotMBBs.push_back(&MBB); - } - } - } else { - SmallVector BlockSizes(MF.size()); - SmallVector BlockCounts(MF.size()); - std::vector OrigOrder; - OrigOrder.reserve(MF.size()); - SmallVector JumpCounts; - - // Renumber blocks for running the layout algorithm. - MF.RenumberBlocks(); - - // Init the MBB size and count. 
- for (auto &MBB : MF) { - auto NonDbgInsts = - instructionsWithoutDebug(MBB.instr_begin(), MBB.instr_end()); - int NumInsts = std::distance(NonDbgInsts.begin(), NonDbgInsts.end()); - BlockSizes[MBB.getNumber()] = 4 * NumInsts; - BlockCounts[MBB.getNumber()] = BlockWeights[&MBB]; - OrigOrder.push_back(&MBB); - } - - // Init the edge count. - for (auto &MBB : MF) { - for (auto *Succ : MBB.successors()) { - auto EdgeWeight = EdgeWeights[std::make_pair(&MBB, Succ)]; - JumpCounts.push_back({static_cast(MBB.getNumber()), - static_cast(Succ->getNumber()), - EdgeWeight}); - } - } - - // Run the layout algorithm. - auto Result = computeExtTspLayout(BlockSizes, BlockCounts, JumpCounts); - for (uint64_t R : Result) { - auto Block = OrigOrder[R]; - if (Block->isEntryBlock() || BlockWeights[Block] > 0) - HotMBBs.push_back(Block); - } - } - - // Generate the "hot" basic block cluster. - if (!HotMBBs.empty()) { - unsigned CurrentPosition = 0; - for (auto &MBB : HotMBBs) { - if (MBB->getBBID()) { - BBClusterInfos.push_back( - {*(MBB->getBBID()), CurrentCluster, CurrentPosition++}); - } - } - } - return BBClusterInfos; -} - // This function sorts basic blocks according to the cluster's information. // All explicitly specified clusters of basic blocks will be ordered // accordingly. All non-specified BBs go into a separate "Cold" section. @@ -383,13 +308,8 @@ bool BasicBlockSections::handleBBSections(MachineFunction &MF) { DenseMap FuncClusterInfo; if (BBSectionsType == BasicBlockSection::List) { - SmallVector ClusterInfo; - if (auto *BMI = getAnalysisIfAvailable()) { - ClusterInfo = createBBClusterInfoForFunction(MF, BMI); - } else { - ClusterInfo = getAnalysis() - .getClusterInfoForFunction(MF.getName()); - } + auto ClusterInfo = getAnalysis() + .getClusterInfoForFunction(MF.getName()); if (ClusterInfo.empty()) return false; for (auto &BBClusterInfo : ClusterInfo) { @@ -479,7 +399,6 @@ bool BasicBlockSections::runOnMachineFunction(MachineFunction &MF) { void BasicBlockSections::getAnalysisUsage(AnalysisUsage &AU) const { AU.setPreservesAll(); AU.addRequired(); - AU.addUsedIfAvailable(); AU.addUsedIfAvailable(); AU.addUsedIfAvailable(); MachineFunctionPass::getAnalysisUsage(AU); diff --git a/llvm/lib/CodeGen/BasicBlockSectionsProfileReader.cpp b/llvm/lib/CodeGen/BasicBlockSectionsProfileReader.cpp index be1c60c57ccf4..c234c0f1b0b34 100644 --- a/llvm/lib/CodeGen/BasicBlockSectionsProfileReader.cpp +++ b/llvm/lib/CodeGen/BasicBlockSectionsProfileReader.cpp @@ -93,15 +93,6 @@ uint64_t BasicBlockSectionsProfileReader::getEdgeCount( return EdgeIt->second; } -std::pair -BasicBlockSectionsProfileReader::getFunctionPathAndClusterInfo( - StringRef FuncName) const { - auto R = ProgramPathAndClusterInfo.find(getAliasName(FuncName)); - return R != ProgramPathAndClusterInfo.end() - ? std::pair(true, R->second) - : std::pair(false, FunctionPathAndClusterInfo()); -} - // Reads the version 1 basic block sections profile. 
Profile for each function // is encoded as follows: // m @@ -523,12 +514,6 @@ uint64_t BasicBlockSectionsProfileReaderWrapperPass::getEdgeCount( return BBSPR.getEdgeCount(FuncName, SrcBBID, SinkBBID); } -std::pair -BasicBlockSectionsProfileReaderWrapperPass::getFunctionPathAndClusterInfo( - StringRef FuncName) const { - return BBSPR.getFunctionPathAndClusterInfo(FuncName); -} - BasicBlockSectionsProfileReader & BasicBlockSectionsProfileReaderWrapperPass::getBBSPR() { return BBSPR; diff --git a/llvm/lib/CodeGen/CMakeLists.txt b/llvm/lib/CodeGen/CMakeLists.txt index 30237e66ed0ec..1cf0b4964760b 100644 --- a/llvm/lib/CodeGen/CMakeLists.txt +++ b/llvm/lib/CodeGen/CMakeLists.txt @@ -35,7 +35,6 @@ add_llvm_component_library(LLVMCodeGen BasicBlockSections.cpp BasicBlockPathCloning.cpp BasicBlockSectionsProfileReader.cpp - BasicBlockMatchingAndInference.cpp CalcSpillWeights.cpp CallBrPrepare.cpp CallingConvLower.cpp diff --git a/llvm/lib/CodeGen/TargetPassConfig.cpp b/llvm/lib/CodeGen/TargetPassConfig.cpp index d94cc70da0ef0..10b723887b21f 100644 --- a/llvm/lib/CodeGen/TargetPassConfig.cpp +++ b/llvm/lib/CodeGen/TargetPassConfig.cpp @@ -272,12 +272,6 @@ static cl::opt cl::desc("Split static data sections into hot and cold " "sections using profile information")); -/// Enable matching and inference when using propeller. -static cl::opt PropellerMatchInfer( - "propeller-match-infer", - cl::desc("Enable matching and inference when using propeller"), - cl::init(false), cl::Optional); - cl::opt EmitBBHash( "emit-bb-hash", cl::desc( @@ -1293,15 +1287,12 @@ void TargetPassConfig::addMachinePasses() { // address map (or both). if (TM->getBBSectionsType() != llvm::BasicBlockSection::None || TM->Options.BBAddrMap) { - if (EmitBBHash || PropellerMatchInfer) + if (EmitBBHash) addPass(llvm::createMachineBlockHashInfoPass()); if (TM->getBBSectionsType() == llvm::BasicBlockSection::List) { addPass(llvm::createBasicBlockSectionsProfileReaderWrapperPass( TM->getBBSectionsFuncListBuf())); - if (PropellerMatchInfer) - addPass(llvm::createBasicBlockMatchingAndInferencePass()); - else - addPass(llvm::createBasicBlockPathCloningPass()); + addPass(llvm::createBasicBlockPathCloningPass()); } addPass(llvm::createBasicBlockSectionsPass()); } diff --git a/llvm/lib/Transforms/Utils/SampleProfileInference.cpp b/llvm/lib/Transforms/Utils/SampleProfileInference.cpp index 934d1589c4a2e..53bcaa6d3df03 100644 --- a/llvm/lib/Transforms/Utils/SampleProfileInference.cpp +++ b/llvm/lib/Transforms/Utils/SampleProfileInference.cpp @@ -1174,6 +1174,8 @@ std::pair assignJumpCosts(const ProfiParams &Params, else CostInc = Params.CostJumpUnknownInc; CostDec = 0; + } else { + assert(Jump.Weight > 0 && "found zero-weight jump with a positive weight"); } return std::make_pair(CostInc, CostDec); } diff --git a/llvm/test/CodeGen/X86/basic-block-sections-clusters-bb-hash.ll b/llvm/test/CodeGen/X86/basic-block-sections-clusters-bb-hash.ll deleted file mode 100644 index 0ce3a522b932d..0000000000000 --- a/llvm/test/CodeGen/X86/basic-block-sections-clusters-bb-hash.ll +++ /dev/null @@ -1,99 +0,0 @@ -; BB cluster section tests when using edges profile and basic block hashes to generate clusters. -; -; Test1: Basic blocks #0 (entry), #1 and #3 will be placed in the same section. -; The rest will be placed in the cold section. 
-; -; RUN: llc %s -O0 -mtriple=x86_64-pc-linux -function-sections -filetype=obj -basic-block-address-map -emit-bb-hash -o %t.o -; -; RUN: echo 'v1' > %t1 -; RUN: echo 'f foo' >> %t1 -; RUN: echo 'g 0:100,1:100,2:0 1:100,3:100 2:0,3:0 3:100' >> %t1 -; -; These commands read BB hashes from SHT_LLVM_BB_ADDR_MAP -; and put them into the basic blocks sections profile. -; RUN: llvm-readobj %t.o --bb-addr-map | \ -; RUN: awk 'BEGIN {printf "h"} \ -; RUN: /ID: [0-9]+/ {id=$2} \ -; RUN: /Hash: 0x[0-9A-Fa-f]+/ {gsub(/^0x/, "", $2); hash=$2; printf " %%s:%%s", id, hash} \ -; RUN: END {print ""}' \ -; RUN: >> %t1 -; -; RUN: llc < %s -O0 -mtriple=x86_64-pc-linux -function-sections -basic-block-sections=%t1 -propeller-match-infer | \ -; RUN: FileCheck %s -check-prefix=LINUX-SECTIONS1 -; -; Test2: Basic #0 (entry), #2 and #3 will be placed in the same section. -; The rest will be placed in the cold section. -; -; RUN: echo 'v1' > %t2 -; RUN: echo 'f foo' >> %t2 -; RUN: echo 'g 0:100,1:0,2:100 1:0,3:0 2:100,3:100 3:100' >> %t2 -; -; These commands read BB hashes from SHT_LLVM_BB_ADDR_MAP -; and put them into the basic blocks sections profile. -; RUN: llvm-readobj %t.o --bb-addr-map | \ -; RUN: awk 'BEGIN {printf "h"} \ -; RUN: /ID: [0-9]+/ {id=$2} \ -; RUN: /Hash: 0x[0-9A-Fa-f]+/ {gsub(/^0x/, "", $2); hash=$2; printf " %%s:%%s", id, hash} \ -; RUN: END {print ""}' \ -; RUN: >> %t2 -; -; RUN: llc < %s -O0 -mtriple=x86_64-pc-linux -function-sections -basic-block-sections=%t2 -propeller-match-infer | \ -; RUN: FileCheck %s -check-prefix=LINUX-SECTIONS2 - -define void @foo(i1 zeroext) nounwind { - %2 = alloca i8, align 1 - %3 = zext i1 %0 to i8 - store i8 %3, ptr %2, align 1 - %4 = load i8, ptr %2, align 1 - %5 = trunc i8 %4 to i1 - br i1 %5, label %6, label %8 - -6: ; preds = %1 - %7 = call i32 @bar() - br label %10 - -8: ; preds = %1 - %9 = call i32 @baz() - br label %10 - -10: ; preds = %8, %6 - ret void -} - -declare i32 @bar() #1 - -declare i32 @baz() #1 - -; LINUX-SECTIONS1: .section .text.foo,"ax",@progbits -; LINUX-SECTIONS1-NOT: .section -; LINUX-SECTIONS1-LABEL: foo: -; LINUX-SECTIONS1-NOT: .section -; LINUX-SECTIONS1-NOT: .LBB_END0_{{0-9}}+ -; LINUX-SECTIONS1-LABEL: # %bb.1: -; LINUX-SECTIONS1-NOT: .section -; LINUX-SECTIONS1-NOT: .LBB_END0_{{0-9}}+ -; LINUX-SECTIONS1-LABEL: .LBB0_3: -; LINUX-SECTIONS1-LABEL: .LBB_END0_3: -; LINUX-SECTIONS1-NEXT: .section .text.split.foo,"ax",@progbits -; LINUX-SECTIONS1-LABEL: foo.cold: -; LINUX-SECTIONS1-LABEL: .LBB_END0_2: -; LINUX-SECTIONS1-NEXT: .size foo.cold, .LBB_END0_2-foo.cold -; LINUX-SECTIONS1-LABEL: .Lfunc_end0: -; LINUX-SECTIONS1-NEXT: .size foo, .Lfunc_end0-foo - -; LINUX-SECTIONS2: .section .text.foo,"ax",@progbits -; LINUX-SECTIONS2-NOT: .section -; LINUX-SECTIONS2-LABEL: foo: -; LINUX-SECTIONS2-NOT: .section -; LINUX-SECTIONS2-NOT: .LBB_END0_{{0-9}}+ -; LINUX-SECTIONS2-LABEL: # %bb.2: -; LINUX-SECTIONS2-NOT: .section -; LINUX-SECTIONS2-NOT: .LBB_END0_{{0-9}}+ -; LINUX-SECTIONS2-LABEL: .LBB0_3: -; LINUX-SECTIONS2-LABEL: .LBB_END0_3: -; LINUX-SECTIONS2-NEXT: .section .text.split.foo,"ax",@progbits -; LINUX-SECTIONS2-LABEL: foo.cold: -; LINUX-SECTIONS2-LABEL: .LBB_END0_1: -; LINUX-SECTIONS2-NEXT: .size foo.cold, .LBB_END0_1-foo.cold -; LINUX-SECTIONS2-LABEL: .Lfunc_end0: -; LINUX-SECTIONS2-NEXT: .size foo, .Lfunc_end0-foo From 843f1224cf205a5798183b08018a644bc1516478 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Tue, 11 Nov 2025 10:47:24 -0800 Subject: [PATCH 22/64] Triple: Add isMacOSVersionGE Triple utils (#167450) The existing function is LT 
but most of the uses are better expressed as GE. --- llvm/include/llvm/TargetParser/Triple.h | 14 ++++++++++++++ llvm/unittests/TargetParser/TripleTest.cpp | 11 +++++++++++ 2 files changed, 25 insertions(+) diff --git a/llvm/include/llvm/TargetParser/Triple.h b/llvm/include/llvm/TargetParser/Triple.h index 0e82dd212f34d..11b76cd183108 100644 --- a/llvm/include/llvm/TargetParser/Triple.h +++ b/llvm/include/llvm/TargetParser/Triple.h @@ -554,15 +554,29 @@ class Triple { return getOSVersion() < VersionTuple(Major, Minor, Micro); } + bool isOSVersionGE(unsigned Major, unsigned Minor = 0, + unsigned Micro = 0) const { + return !isOSVersionLT(Major, Minor, Micro); + } + bool isOSVersionLT(const Triple &Other) const { return getOSVersion() < Other.getOSVersion(); } + bool isOSVersionGE(const Triple &Other) const { + return getOSVersion() >= Other.getOSVersion(); + } + /// Comparison function for checking OS X version compatibility, which handles /// supporting skewed version numbering schemes used by the "darwin" triples. LLVM_ABI bool isMacOSXVersionLT(unsigned Major, unsigned Minor = 0, unsigned Micro = 0) const; + bool isMacOSXVersionGE(unsigned Major, unsigned Minor = 0, + unsigned Micro = 0) const { + return !isMacOSXVersionLT(Major, Minor, Micro); + } + /// Is this a Mac OS X triple. For legacy reasons, we support both "darwin" /// and "osx" as OS X triples. bool isMacOSX() const { diff --git a/llvm/unittests/TargetParser/TripleTest.cpp b/llvm/unittests/TargetParser/TripleTest.cpp index 3e803691cfd1f..df8284d7be66a 100644 --- a/llvm/unittests/TargetParser/TripleTest.cpp +++ b/llvm/unittests/TargetParser/TripleTest.cpp @@ -2630,6 +2630,17 @@ TEST(TripleTest, isMacOSVersionLT) { EXPECT_FALSE(T.isMacOSXVersionLT(10, 15, 0)); } +TEST(TripleTest, isMacOSVersionGE) { + Triple T = Triple("x86_64-apple-macos11"); + EXPECT_FALSE(T.isMacOSXVersionGE(11, 1, 0)); + EXPECT_TRUE(T.isMacOSXVersionGE(10, 15, 0)); + + T = Triple("x86_64-apple-darwin20"); + EXPECT_FALSE(T.isMacOSXVersionGE(11, 1, 0)); + EXPECT_TRUE(T.isMacOSXVersionGE(11, 0, 0)); + EXPECT_TRUE(T.isMacOSXVersionGE(10, 15, 0)); +} + TEST(TripleTest, CanonicalizeOSVersion) { EXPECT_EQ(VersionTuple(10, 15, 4), Triple::getCanonicalVersionForOS(Triple::MacOSX, From de68181d7f490f993c478019bc0ab299d3238296 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Tue, 11 Nov 2025 10:51:23 -0800 Subject: [PATCH 23/64] DAG: Use sincos vector libcalls through RuntimeLibcalls (#166984) Copy new process from sincospi. 
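As a hedged illustration (an editorial sketch, not part of the patch): once vector EVTs map to RTLIB sincos entries, IR like the following can be lowered to a single vector libcall such as `SINCOS_V4F32` when the target's RuntimeLibcalls provide one, instead of first scalarizing into four scalar `sincosf` calls. `@vec_sincos` is a hypothetical name chosen for the example.
```
; Minimal sketch, assuming a target/vector library that supplies a
; v4f32 sincos entry point; without one, this still scalarizes as before.
define { <4 x float>, <4 x float> } @vec_sincos(<4 x float> %x) {
  %res = call { <4 x float>, <4 x float> } @llvm.sincos.v4f32(<4 x float> %x)
  ret { <4 x float>, <4 x float> } %res
}
declare { <4 x float>, <4 x float> } @llvm.sincos.v4f32(<4 x float>)
```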
--- llvm/include/llvm/CodeGen/BasicTTIImpl.h | 7 ++++++- .../CodeGen/SelectionDAG/LegalizeVectorOps.cpp | 14 ++++---------- llvm/lib/CodeGen/TargetLoweringBase.cpp | 18 ++++++++++++++++++ 3 files changed, 28 insertions(+), 11 deletions(-) diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h index 1c167af4b0478..a52ad41d0f1b3 100644 --- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h +++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h @@ -334,7 +334,12 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase { break; case Intrinsic::sincos: - LC = RTLIB::getSINCOS(ScalarVT); + LC = RTLIB::getSINCOS(VT); + if (LC == RTLIB::UNKNOWN_LIBCALL) + LC = RTLIB::getSINCOS(ScalarVT); + else if (VT.isVector()) + IsVectorCall = true; + break; default: return std::nullopt; diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp index f5a54497c8a98..78d8ea0676dd7 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp @@ -1268,10 +1268,12 @@ void VectorLegalizer::Expand(SDNode *Node, SmallVectorImpl &Results) { return; break; - + case ISD::FSINCOS: case ISD::FSINCOSPI: { EVT VT = Node->getValueType(0); - RTLIB::Libcall LC = RTLIB::getSINCOSPI(VT); + RTLIB::Libcall LC = Node->getOpcode() == ISD::FSINCOS + ? RTLIB::getSINCOS(VT) + : RTLIB::getSINCOSPI(VT); if (LC != RTLIB::UNKNOWN_LIBCALL && DAG.expandMultipleResultFPLibCall(LC, Node, Results, VT)) return; @@ -1280,14 +1282,6 @@ void VectorLegalizer::Expand(SDNode *Node, SmallVectorImpl &Results) { // scalarizing. break; } - case ISD::FSINCOS: { - // FIXME: Try to directly match vector case like fsincospi - EVT VT = Node->getValueType(0).getVectorElementType(); - RTLIB::Libcall LC = RTLIB::getSINCOS(VT); - if (DAG.expandMultipleResultFPLibCall(LC, Node, Results, VT)) - return; - break; - } case ISD::FMODF: { EVT VT = Node->getValueType(0).getVectorElementType(); RTLIB::Libcall LC = RTLIB::getMODF(VT); diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp index 01216552ed260..36a424f1c8b63 100644 --- a/llvm/lib/CodeGen/TargetLoweringBase.cpp +++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp @@ -424,6 +424,24 @@ RTLIB::Libcall RTLIB::getCOS(EVT RetVT) { } RTLIB::Libcall RTLIB::getSINCOS(EVT RetVT) { + // TODO: Tablegen should generate this function + if (RetVT.isVector()) { + if (!RetVT.isSimple()) + return RTLIB::UNKNOWN_LIBCALL; + switch (RetVT.getSimpleVT().SimpleTy) { + case MVT::v4f32: + return RTLIB::SINCOS_V4F32; + case MVT::v2f64: + return RTLIB::SINCOS_V2F64; + case MVT::nxv4f32: + return RTLIB::SINCOS_NXV4F32; + case MVT::nxv2f64: + return RTLIB::SINCOS_NXV2F64; + default: + return RTLIB::UNKNOWN_LIBCALL; + } + } + return getFPLibCall(RetVT, SINCOS_F32, SINCOS_F64, SINCOS_F80, SINCOS_F128, SINCOS_PPCF128); } From 93b71e616288446041098cce4842e111cc054ed7 Mon Sep 17 00:00:00 2001 From: anoopkg6 Date: Tue, 11 Nov 2025 12:51:45 -0600 Subject: [PATCH 24/64] Llvm jitlink build failure (#167561) Fixed stub relocation test. Just need to check 32-bit. 
--------- Co-authored-by: anoopkg6 --- .../JITLink/systemz/ELF_systemz_reloc_call_pic.s | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/llvm/test/ExecutionEngine/JITLink/systemz/ELF_systemz_reloc_call_pic.s b/llvm/test/ExecutionEngine/JITLink/systemz/ELF_systemz_reloc_call_pic.s index b753ec54b2361..743181655a5cc 100644 --- a/llvm/test/ExecutionEngine/JITLink/systemz/ELF_systemz_reloc_call_pic.s +++ b/llvm/test/ExecutionEngine/JITLink/systemz/ELF_systemz_reloc_call_pic.s @@ -61,8 +61,9 @@ test_call_extern_plt: # Check PLT stub relocation for lgrl(Delta32dbl). # # jitlink-check: *{4}(stub_addr(elf_pic_reloc.o, extern_out_of_range32) + 2) = \ -# jitlink-check: (got_addr(elf_pic_reloc.o, extern_out_of_range32) - \ -# jitlink-check: stub_addr(elf_pic_reloc.o, extern_out_of_range32)) >> 1 +# jitlink-check: ((got_addr(elf_pic_reloc.o, extern_out_of_range32) - \ +# jitlink-check: stub_addr(elf_pic_reloc.o, extern_out_of_range32)) >> 1) \ +# jitlink-check: & 0xffffffff .globl test_call_extern_plt_stub .p2align 4 .type test_call_extern_plt_stub,@function From ccb5145460711928765432e2cbabac8ad526f8d0 Mon Sep 17 00:00:00 2001 From: jimingham Date: Tue, 11 Nov 2025 11:06:40 -0800 Subject: [PATCH 25/64] Add a workaround for people that use *args instead of listing (#166883) parameters when defining the scripting interfaces. We try to count the parameters to make sure the user has defined them correctly, but this throws the counting off. I'm not adding a test for this because then it would seem like we thought this was a good idea. I'd actually rather not support it altogether, but we added the parameter checking pretty recently so there are extant implementations that we broke. I only want to support them, not suggest anyone else do this going forward. --- .../Python/Interfaces/ScriptedPythonInterface.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPythonInterface.h b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPythonInterface.h index ec1dd9910d8a6..af88a69e34a13 100644 --- a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPythonInterface.h +++ b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPythonInterface.h @@ -188,8 +188,13 @@ class ScriptedPythonInterface : virtual public ScriptedInterface { // This addresses the cases where the embedded interpreter session // dictionary is passed to the extension initializer which is not used // most of the time. + // Note, though none of our API's suggest defining the interfaces with + // varargs, we have some extant clients that were doing that. To keep + // from breaking them, we just say putting a varargs in these signatures + // turns off argument checking. size_t num_args = sizeof...(Args); - if (num_args != arg_info->max_positional_args) { + if (arg_info->max_positional_args != PythonCallable::ArgInfo::UNBOUNDED && + num_args != arg_info->max_positional_args) { if (num_args != arg_info->max_positional_args - 1) return create_error("Passed arguments ({0}) doesn't match the number " "of expected arguments ({1}).", From ea56ca2da3954d97b77fe44652d8728b0f50aa4c Mon Sep 17 00:00:00 2001 From: Aiden Grossman Date: Tue, 11 Nov 2025 11:11:46 -0800 Subject: [PATCH 26/64] [CHR] Make Selects Created in MergedCondition have Unknown Profdata (#167534) These selects are dependent on values live into the CHRScope that we cannot infer anything about, so mark the branch weights unknown. 
These selects usually also just get folded down into icmps, so the profile information ends up being kind of redundant. --- .../ControlHeightReduction.cpp | 2 + .../Transforms/PGOProfile/chr-lifetimes.ll | 68 ++++++++++++------- llvm/utils/profcheck-xfail.txt | 4 -- 3 files changed, 47 insertions(+), 27 deletions(-) diff --git a/llvm/lib/Transforms/Instrumentation/ControlHeightReduction.cpp b/llvm/lib/Transforms/Instrumentation/ControlHeightReduction.cpp index 0688bc7ac08eb..726d94b27a7f2 100644 --- a/llvm/lib/Transforms/Instrumentation/ControlHeightReduction.cpp +++ b/llvm/lib/Transforms/Instrumentation/ControlHeightReduction.cpp @@ -1992,6 +1992,8 @@ void CHR::addToMergedCondition(bool IsTrueBiased, Value *Cond, // Use logical and to avoid propagating poison from later conditions. MergedCondition = IRB.CreateLogicalAnd(MergedCondition, Cond); + setExplicitlyUnknownBranchWeightsIfProfiled( + *cast(MergedCondition), DEBUG_TYPE); } void CHR::transformScopes(SmallVectorImpl &CHRScopes) { diff --git a/llvm/test/Transforms/PGOProfile/chr-lifetimes.ll b/llvm/test/Transforms/PGOProfile/chr-lifetimes.ll index b29834f9fe960..6e543b8c87fc7 100644 --- a/llvm/test/Transforms/PGOProfile/chr-lifetimes.ll +++ b/llvm/test/Transforms/PGOProfile/chr-lifetimes.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals ; RUN: opt < %s -passes='require,chr' -S | FileCheck %s declare void @foo() @@ -14,21 +14,21 @@ define void @test_chr_with_lifetimes(ptr %i) !prof !14 { ; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[I:%.*]], align 4 ; CHECK-NEXT: [[TMP1:%.*]] = icmp ne i32 [[TMP0]], 0 ; CHECK-NEXT: [[TMP9:%.*]] = freeze i1 [[TMP1]] -; CHECK-NEXT: [[TMP10:%.*]] = select i1 true, i1 [[TMP9]], i1 false +; CHECK-NEXT: [[TMP8:%.*]] = select i1 true, i1 [[TMP9]], i1 false, !prof [[PROF15:![0-9]+]] ; CHECK-NEXT: [[TMP11:%.*]] = freeze i1 [[TMP1]] -; CHECK-NEXT: [[TMP5:%.*]] = select i1 [[TMP10]], i1 [[TMP11]], i1 false -; CHECK-NEXT: br i1 [[TMP5]], label [[ENTRY_SPLIT:%.*]], label [[ENTRY_SPLIT_NONCHR:%.*]], !prof [[PROF15:![0-9]+]] +; CHECK-NEXT: [[TMP5:%.*]] = select i1 [[TMP8]], i1 [[TMP11]], i1 false, !prof [[PROF15]] +; CHECK-NEXT: br i1 [[TMP5]], label [[ENTRY_SPLIT:%.*]], label [[ENTRY_SPLIT_NONCHR:%.*]], !prof [[PROF16:![0-9]+]] ; CHECK: entry.split: -; CHECK-NEXT: [[TMP6:%.*]] = select i1 true, i64 0, i64 4, !prof [[PROF16:![0-9]+]] +; CHECK-NEXT: [[TMP6:%.*]] = select i1 true, i64 0, i64 4, !prof [[PROF17:![0-9]+]] ; CHECK-NEXT: call void @baz(i64 [[TMP6]]) -; CHECK-NEXT: br i1 false, label [[BB1:%.*]], label [[BB0:%.*]], !prof [[PROF17:![0-9]+]] +; CHECK-NEXT: br i1 false, label [[BB1:%.*]], label [[BB0:%.*]], !prof [[PROF18:![0-9]+]] ; CHECK: bb0: ; CHECK-NEXT: call void @foo() ; CHECK-NEXT: br label [[BB1]] ; CHECK: entry.split.nonchr: -; CHECK-NEXT: [[TMP7:%.*]] = select i1 [[TMP1]], i64 0, i64 4, !prof [[PROF16]] +; CHECK-NEXT: [[TMP7:%.*]] = select i1 [[TMP1]], i64 0, i64 4, !prof [[PROF17]] ; CHECK-NEXT: call void @baz(i64 [[TMP7]]) -; CHECK-NEXT: br i1 [[TMP1]], label [[BB0_NONCHR:%.*]], label [[BB1]], !prof [[PROF16]] +; CHECK-NEXT: br i1 [[TMP1]], label [[BB0_NONCHR:%.*]], label [[BB1]], !prof [[PROF17]] ; CHECK: bb0.nonchr: ; CHECK-NEXT: call void @foo() ; CHECK-NEXT: br label [[BB1]] @@ -83,24 +83,24 @@ define void @test_chr_dynamic_alloca(ptr %i) !prof !14 { ; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[I]], align 4 ; CHECK-NEXT: [[TMP1:%.*]] = icmp ne i32 
[[TMP0]], 0 ; CHECK-NEXT: [[TMP2:%.*]] = freeze i1 [[TMP1]] -; CHECK-NEXT: [[TMP3:%.*]] = select i1 true, i1 [[TMP2]], i1 false +; CHECK-NEXT: [[TMP3:%.*]] = select i1 true, i1 [[TMP2]], i1 false, !prof [[PROF15]] ; CHECK-NEXT: [[TMP4:%.*]] = freeze i1 [[TMP1]] -; CHECK-NEXT: [[TMP5:%.*]] = select i1 [[TMP3]], i1 [[TMP4]], i1 false -; CHECK-NEXT: br i1 [[TMP5]], label [[BB4_SPLIT:%.*]], label [[BB4_SPLIT_NONCHR:%.*]], !prof [[PROF15]] +; CHECK-NEXT: [[TMP5:%.*]] = select i1 [[TMP3]], i1 [[TMP4]], i1 false, !prof [[PROF15]] +; CHECK-NEXT: br i1 [[TMP5]], label [[BB4_SPLIT:%.*]], label [[BB4_SPLIT_NONCHR:%.*]], !prof [[PROF16]] ; CHECK: bb4.split: -; CHECK-NEXT: [[TMP6:%.*]] = select i1 true, i64 0, i64 4, !prof [[PROF16]] +; CHECK-NEXT: [[TMP6:%.*]] = select i1 true, i64 0, i64 4, !prof [[PROF17]] ; CHECK-NEXT: [[TEST:%.*]] = alloca i32, align 8 ; CHECK-NEXT: call void @baz(i64 [[TMP6]]) -; CHECK-NEXT: br i1 false, label [[BB1:%.*]], label [[BB0:%.*]], !prof [[PROF17]] +; CHECK-NEXT: br i1 false, label [[BB1:%.*]], label [[BB0:%.*]], !prof [[PROF18]] ; CHECK: bb0: ; CHECK-NEXT: call void @foo() ; CHECK-NEXT: store ptr [[TEST]], ptr [[I]], align 8 ; CHECK-NEXT: br label [[BB1]] ; CHECK: bb4.split.nonchr: -; CHECK-NEXT: [[TMP7:%.*]] = select i1 [[TMP1]], i64 0, i64 4, !prof [[PROF16]] +; CHECK-NEXT: [[TMP7:%.*]] = select i1 [[TMP1]], i64 0, i64 4, !prof [[PROF17]] ; CHECK-NEXT: [[TEST_NONCHR:%.*]] = alloca i32, align 8 ; CHECK-NEXT: call void @baz(i64 [[TMP7]]) -; CHECK-NEXT: br i1 [[TMP1]], label [[BB0_NONCHR:%.*]], label [[BB1]], !prof [[PROF16]] +; CHECK-NEXT: br i1 [[TMP1]], label [[BB0_NONCHR:%.*]], label [[BB1]], !prof [[PROF17]] ; CHECK: bb0.nonchr: ; CHECK-NEXT: call void @foo() ; CHECK-NEXT: store ptr [[TEST_NONCHR]], ptr [[I]], align 8 @@ -167,21 +167,21 @@ define void @test_no_move_allocas(ptr %i) !prof !14 { ; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[I:%.*]], align 4 ; CHECK-NEXT: [[TMP1:%.*]] = icmp ne i32 [[TMP0]], 0 ; CHECK-NEXT: [[TMP2:%.*]] = freeze i1 [[TMP1]] -; CHECK-NEXT: [[TMP3:%.*]] = select i1 true, i1 [[TMP2]], i1 false +; CHECK-NEXT: [[TMP3:%.*]] = select i1 true, i1 [[TMP2]], i1 false, !prof [[PROF15]] ; CHECK-NEXT: [[TMP4:%.*]] = freeze i1 [[TMP1]] -; CHECK-NEXT: [[TMP5:%.*]] = select i1 [[TMP3]], i1 [[TMP4]], i1 false -; CHECK-NEXT: br i1 [[TMP5]], label [[ENTRY_SPLIT:%.*]], label [[ENTRY_SPLIT_NONCHR:%.*]], !prof [[PROF15]] +; CHECK-NEXT: [[TMP5:%.*]] = select i1 [[TMP3]], i1 [[TMP4]], i1 false, !prof [[PROF15]] +; CHECK-NEXT: br i1 [[TMP5]], label [[ENTRY_SPLIT:%.*]], label [[ENTRY_SPLIT_NONCHR:%.*]], !prof [[PROF16]] ; CHECK: entry.split: -; CHECK-NEXT: [[TMP6:%.*]] = select i1 true, i64 0, i64 4, !prof [[PROF16]] +; CHECK-NEXT: [[TMP6:%.*]] = select i1 true, i64 0, i64 4, !prof [[PROF17]] ; CHECK-NEXT: call void @baz(i64 [[TMP6]]) -; CHECK-NEXT: br i1 false, label [[BB1:%.*]], label [[BB0:%.*]], !prof [[PROF17]] +; CHECK-NEXT: br i1 false, label [[BB1:%.*]], label [[BB0:%.*]], !prof [[PROF18]] ; CHECK: bb0: ; CHECK-NEXT: call void @foo() ; CHECK-NEXT: br label [[BB1]] ; CHECK: entry.split.nonchr: -; CHECK-NEXT: [[TMP7:%.*]] = select i1 [[TMP1]], i64 0, i64 4, !prof [[PROF16]] +; CHECK-NEXT: [[TMP7:%.*]] = select i1 [[TMP1]], i64 0, i64 4, !prof [[PROF17]] ; CHECK-NEXT: call void @baz(i64 [[TMP7]]) -; CHECK-NEXT: br i1 [[TMP1]], label [[BB0_NONCHR:%.*]], label [[BB1]], !prof [[PROF16]] +; CHECK-NEXT: br i1 [[TMP1]], label [[BB0_NONCHR:%.*]], label [[BB1]], !prof [[PROF17]] ; CHECK: bb0.nonchr: ; CHECK-NEXT: call void @foo() ; CHECK-NEXT: br label [[BB1]] @@ 
-242,4 +242,26 @@ bb3: !14 = !{!"function_entry_count", i64 100} !15 = !{!"branch_weights", i32 0, i32 1} -; CHECK: !15 = !{!"branch_weights", i32 1000, i32 0} +;. +; CHECK: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) } +;. +; CHECK: [[META0:![0-9]+]] = !{i32 1, !"ProfileSummary", [[META1:![0-9]+]]} +; CHECK: [[META1]] = !{[[META2:![0-9]+]], [[META3:![0-9]+]], [[META4:![0-9]+]], [[META5:![0-9]+]], [[META6:![0-9]+]], [[META7:![0-9]+]], [[META8:![0-9]+]], [[META9:![0-9]+]]} +; CHECK: [[META2]] = !{!"ProfileFormat", !"InstrProf"} +; CHECK: [[META3]] = !{!"TotalCount", i64 10000} +; CHECK: [[META4]] = !{!"MaxCount", i64 10} +; CHECK: [[META5]] = !{!"MaxInternalCount", i64 1} +; CHECK: [[META6]] = !{!"MaxFunctionCount", i64 1000} +; CHECK: [[META7]] = !{!"NumCounts", i64 3} +; CHECK: [[META8]] = !{!"NumFunctions", i64 3} +; CHECK: [[META9]] = !{!"DetailedSummary", [[META10:![0-9]+]]} +; CHECK: [[META10]] = !{[[META11:![0-9]+]], [[META12:![0-9]+]], [[META13:![0-9]+]]} +; CHECK: [[META11]] = !{i32 10000, i64 100, i32 1} +; CHECK: [[META12]] = !{i32 999000, i64 100, i32 1} +; CHECK: [[META13]] = !{i32 999999, i64 1, i32 2} +; CHECK: [[META14:![0-9]+]] = !{!"function_entry_count", i64 100} +; CHECK: [[PROF15]] = !{!"unknown", !"chr"} +; CHECK: [[PROF16]] = !{!"branch_weights", i32 1000, i32 0} +; CHECK: [[PROF17]] = !{!"branch_weights", i32 1, i32 0} +; CHECK: [[PROF18]] = !{!"branch_weights", i32 0, i32 1} +;. diff --git a/llvm/utils/profcheck-xfail.txt b/llvm/utils/profcheck-xfail.txt index 15f2c79784d1e..10a7b62229b8d 100644 --- a/llvm/utils/profcheck-xfail.txt +++ b/llvm/utils/profcheck-xfail.txt @@ -608,10 +608,6 @@ Transforms/OpenMP/spmdization.ll Transforms/OpenMP/spmdization_no_guarding_two_reaching_kernels.ll Transforms/OpenMP/spmdization_remarks.ll Transforms/PartiallyInlineLibCalls/X86/good-prototype.ll -Transforms/PGOProfile/chr-dead-pred.ll -Transforms/PGOProfile/chr-dup-threshold.ll -Transforms/PGOProfile/chr-lifetimes.ll -Transforms/PGOProfile/chr-poison.ll Transforms/PGOProfile/comdat.ll Transforms/PGOProfile/memop_profile_funclet_wasm.ll Transforms/PGOProfile/X86/macho.ll From b8add3710dee22a85aaf9c3dcde8c1da1f728b6f Mon Sep 17 00:00:00 2001 From: Akash Dutta <137309513+akadutta@users.noreply.github.com> Date: Tue, 11 Nov 2025 13:14:48 -0600 Subject: [PATCH 27/64] [AMDGPU] Add pattern to select scalar ops for fshr with uniform operands (#165295) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reasoning behind proposed change. This helps us move away from selecting v_alignbits for fshr with uniform operands. V_ALIGNBIT is defined in the ISA as: D0.u32 = 32'U(({ S0.u32, S1.u32 } >> S2.u32[4 : 0]) & 0xffffffffLL) Note: S0 carries the MSBs and S1 carries the LSBs of the value being aligned. I interpret that as : concat (s0, s1) >> S2, and use the 0X1F mask to return the lower 32 bits. fshr: fshr i32 %src0, i32 %src1, i32 %src2 Where: concat(%src0, %src1) represents the 64-bit value formed by %src0 as the high 32 bits and %src1 as the low 32 bits. %src2 is the shift amount. Only the lower 32 bits are returned. So these two are identical. So, I can expand the V_ALIGNBIT through bit manipulation as: Concat: S1 | (S0 << 32) Shift: ((S1 | (S0 << 32)) >> S2) Break the shift: (S1>>S2) | (S0 << (32 – S2) The proposed pattern does exactly this. Additionally, src2 in the fshr pattern should be: * must be 0–31. 
* If the shift is ≥32, hardware semantics differ; you must handle it with extra instructions. The extra S_ANDs limit the selection only to the last 5 bits. --- llvm/lib/Target/AMDGPU/SIInstructions.td | 11 + .../CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll | 24690 ++++++++-------- .../CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll | 1336 +- .../CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll | 2627 +- .../CodeGen/AMDGPU/amdgcn.bitcast.32bit.ll | 216 +- .../CodeGen/AMDGPU/amdgcn.bitcast.48bit.ll | 92 +- .../CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll | 7210 ++--- .../CodeGen/AMDGPU/amdgcn.bitcast.64bit.ll | 662 +- .../CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll | 675 +- .../CodeGen/AMDGPU/any_extend_vector_inreg.ll | 30 +- llvm/test/CodeGen/AMDGPU/bf16.ll | 84 +- llvm/test/CodeGen/AMDGPU/build_vector.ll | 4 +- .../AMDGPU/divergence-driven-buildvector.ll | 6 +- llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll | 82 +- llvm/test/CodeGen/AMDGPU/fneg-fabs.bf16.ll | 46 +- llvm/test/CodeGen/AMDGPU/fneg.bf16.ll | 62 +- llvm/test/CodeGen/AMDGPU/fshl.ll | 668 +- llvm/test/CodeGen/AMDGPU/fshr.ll | 1327 +- .../CodeGen/AMDGPU/insert_vector_elt.v2i16.ll | 64 +- llvm/test/CodeGen/AMDGPU/load-constant-i8.ll | 804 +- llvm/test/CodeGen/AMDGPU/load-global-i8.ll | 863 +- llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll | 8 +- 22 files changed, 22167 insertions(+), 19400 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td index 6f1feb1dc2996..6dd4b1d7bd000 100644 --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -791,6 +791,17 @@ def : GCNPat< (SI_CALL_ISEL $src0, (i64 0)) >; +// Funnel shift right (fshr) patterns for uniform inputs. +// These patterns implement this using scalar instructions by constructing a 64-bit +// value {a, b} and performing a single right shift. +def : GCNPat<(UniformTernaryFrag i32:$src0, i32:$src1, i32:$src2), + (i32 (EXTRACT_SUBREG (S_LSHR_B64 (REG_SEQUENCE SReg_64, $src1, sub0, $src0, sub1), (S_AND_B32 $src2, (i32 31))), sub0)) +>; + +def : GCNPat<(UniformTernaryFrag i32:$src0, i32:$src1, (i32 ShiftAmt32Imm:$src2)), + (i32 (EXTRACT_SUBREG (S_LSHR_B64 (REG_SEQUENCE SReg_64, $src1, sub0, $src0, sub1), $src2), sub0)) +>; + // Wrapper around s_swappc_b64 with extra $callee parameter to track // the called function after regalloc. 
def SI_CALL : SPseudoInstSI < diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll index 08e64da632d3b..9b329b338d090 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll @@ -23697,8 +23697,17 @@ define inreg <64 x bfloat> @bitcast_v32i32_to_v64bf16_scalar(<32 x i32> inreg %a ; SI-NEXT: s_mov_b32 s72, s74 ; SI-NEXT: s_mov_b32 s73, s75 ; SI-NEXT: s_mov_b32 s74, s76 -; SI-NEXT: v_readlane_b32 s75, v21, 0 -; SI-NEXT: v_readlane_b32 s76, v21, 1 +; SI-NEXT: s_mov_b32 s75, s77 +; SI-NEXT: s_mov_b32 s76, s78 +; SI-NEXT: s_mov_b32 s77, s79 +; SI-NEXT: s_mov_b32 s78, s88 +; SI-NEXT: s_mov_b32 s79, s89 +; SI-NEXT: s_mov_b32 s88, s90 +; SI-NEXT: s_mov_b32 s89, s91 +; SI-NEXT: s_mov_b32 s90, s92 +; SI-NEXT: s_mov_b32 s91, s93 +; SI-NEXT: v_readlane_b32 s92, v21, 0 +; SI-NEXT: v_readlane_b32 s93, v21, 1 ; SI-NEXT: s_cbranch_vccnz .LBB17_5 ; SI-NEXT: ; %bb.4: ; %cmp.true ; SI-NEXT: s_add_i32 s16, s16, 3 @@ -23760,16 +23769,16 @@ define inreg <64 x bfloat> @bitcast_v32i32_to_v64bf16_scalar(<32 x i32> inreg %a ; SI-NEXT: s_lshl_b32 s62, s84, 16 ; SI-NEXT: s_and_b32 s73, s83, 0xffff0000 ; SI-NEXT: s_lshl_b32 s72, s83, 16 -; SI-NEXT: s_and_b32 s77, s82, 0xffff0000 +; SI-NEXT: s_and_b32 s75, s82, 0xffff0000 ; SI-NEXT: s_lshl_b32 s74, s82, 16 -; SI-NEXT: s_and_b32 s79, s81, 0xffff0000 -; SI-NEXT: s_lshl_b32 s78, s81, 16 -; SI-NEXT: s_and_b32 s89, s80, 0xffff0000 -; SI-NEXT: s_lshl_b32 s88, s80, 16 -; SI-NEXT: s_and_b32 s91, s71, 0xffff0000 -; SI-NEXT: s_lshl_b32 s90, s71, 16 -; SI-NEXT: s_and_b32 s93, s70, 0xffff0000 -; SI-NEXT: s_lshl_b32 s92, s70, 16 +; SI-NEXT: s_and_b32 s77, s81, 0xffff0000 +; SI-NEXT: s_lshl_b32 s76, s81, 16 +; SI-NEXT: s_and_b32 s79, s80, 0xffff0000 +; SI-NEXT: s_lshl_b32 s78, s80, 16 +; SI-NEXT: s_and_b32 s89, s71, 0xffff0000 +; SI-NEXT: s_lshl_b32 s88, s71, 16 +; SI-NEXT: s_and_b32 s91, s70, 0xffff0000 +; SI-NEXT: s_lshl_b32 s90, s70, 16 ; SI-NEXT: s_and_b32 s95, s29, 0xffff0000 ; SI-NEXT: s_lshl_b32 s94, s29, 16 ; SI-NEXT: s_and_b32 s31, s28, 0xffff0000 @@ -23794,8 +23803,8 @@ define inreg <64 x bfloat> @bitcast_v32i32_to_v64bf16_scalar(<32 x i32> inreg %a ; SI-NEXT: s_lshl_b32 s66, s19, 16 ; SI-NEXT: s_and_b32 s69, s18, 0xffff0000 ; SI-NEXT: s_lshl_b32 s68, s18, 16 -; SI-NEXT: s_and_b32 s76, s17, 0xffff0000 -; SI-NEXT: s_lshl_b32 s75, s17, 16 +; SI-NEXT: s_and_b32 s93, s17, 0xffff0000 +; SI-NEXT: s_lshl_b32 s92, s17, 16 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_writelane_b32 v21, s6, 2 ; SI-NEXT: s_lshl_b32 s6, s16, 16 @@ -23804,228 +23813,228 @@ define inreg <64 x bfloat> @bitcast_v32i32_to_v64bf16_scalar(<32 x i32> inreg %a ; SI-NEXT: v_readlane_b32 s6, v21, 2 ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s6 ; SI-NEXT: v_readlane_b32 s6, v21, 3 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s6 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s6 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: v_readlane_b32 s99, v20, 35 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s76 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s75 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s93 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s92 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 ; 
SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s69 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s68 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s68 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s67 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s66 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s66 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s65 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s64 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s64 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s55 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s54 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s54 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s53 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s52 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s52 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s51 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s50 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s50 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s49 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s48 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s48 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s39 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s38 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s38 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s37 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s36 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; 
SI-NEXT: v_mul_f32_e64 v1, 1.0, s36 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s35 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s34 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s34 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s31 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s30 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s30 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s95 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s94 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s94 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s93 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s92 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s91 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s90 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s91 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s90 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s89 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s88 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s89 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s88 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s79 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s78 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s79 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s78 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s77 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s76 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s77 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s74 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s75 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s74 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: 
v_add_i32_e32 v2, vcc, 0x48, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s73 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s72 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s72 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s63 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s62 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s62 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s61 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s60 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s60 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s59 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s58 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s58 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s57 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s56 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s56 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s47 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s46 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s46 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s45 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s44 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s44 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s43 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s42 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s42 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s41 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s40 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 
16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s40 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s15 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s14 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s14 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s13 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s12 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s12 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s11 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s10 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s10 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s5 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s4 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s4 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: v_readlane_b32 s99, v20, 35 ; SI-NEXT: v_readlane_b32 s98, v20, 34 ; SI-NEXT: v_readlane_b32 s97, v20, 33 ; SI-NEXT: v_readlane_b32 s96, v20, 32 @@ -27302,562 +27311,737 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v52, v30 -; SI-NEXT: v_mov_b32_e32 v53, v28 -; SI-NEXT: v_mov_b32_e32 v40, v12 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:40 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v33, 
off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:68 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:72 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:68 -; SI-NEXT: v_mov_b32_e32 v55, v14 -; SI-NEXT: v_mul_f32_e32 v14, 1.0, v0 -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v6 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v63, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v5 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v8 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v7 +; SI-NEXT: v_mov_b32_e32 v43, v21 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v10 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v11 +; SI-NEXT: v_mov_b32_e32 v54, v29 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v55 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v43 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v16 -; SI-NEXT: v_mul_f32_e32 v58, 1.0, v1 -; SI-NEXT: v_mul_f32_e32 v56, 1.0, v3 -; SI-NEXT: v_mul_f32_e32 v54, 1.0, v2 -; SI-NEXT: v_mul_f32_e32 v44, 1.0, v5 -; SI-NEXT: v_mul_f32_e32 v46, 1.0, v4 -; SI-NEXT: v_mul_f32_e32 v61, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v54 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v41, v23 +; SI-NEXT: v_mov_b32_e32 v29, v20 +; SI-NEXT: v_mul_f32_e32 v57, 1.0, v3 ; SI-NEXT: v_mul_f32_e32 v59, 1.0, v9 -; SI-NEXT: v_mul_f32_e32 v57, 1.0, v11 -; SI-NEXT: v_mul_f32_e32 v13, 1.0, v13 -; SI-NEXT: v_mul_f32_e32 v47, 1.0, v40 -; SI-NEXT: v_mul_f32_e32 v45, 1.0, v15 -; SI-NEXT: v_mul_f32_e32 v15, 1.0, v17 -; SI-NEXT: buffer_store_dword 
v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: v_mul_f32_e32 v16, 1.0, v19 -; SI-NEXT: v_mul_f32_e32 v43, 1.0, v18 -; SI-NEXT: v_mul_f32_e32 v17, 1.0, v21 -; SI-NEXT: v_mul_f32_e32 v41, 1.0, v20 -; SI-NEXT: v_mul_f32_e32 v18, 1.0, v23 -; SI-NEXT: v_mul_f32_e32 v40, 1.0, v22 -; SI-NEXT: v_mul_f32_e32 v19, 1.0, v25 -; SI-NEXT: v_mul_f32_e32 v55, 1.0, v24 -; SI-NEXT: v_mul_f32_e32 v20, 1.0, v27 -; SI-NEXT: v_mul_f32_e32 v21, 1.0, v29 -; SI-NEXT: v_mul_f32_e32 v53, 1.0, v53 -; SI-NEXT: v_mul_f32_e32 v52, 1.0, v52 +; SI-NEXT: v_mul_f32_e32 v61, 1.0, v13 +; SI-NEXT: v_mul_f32_e32 v23, 1.0, v15 +; SI-NEXT: v_mul_f32_e32 v44, 1.0, v17 +; SI-NEXT: v_mul_f32_e32 v21, 1.0, v19 +; SI-NEXT: v_mul_f32_e32 v20, 1.0, v41 +; SI-NEXT: v_mul_f32_e32 v17, 1.0, v25 +; SI-NEXT: v_mul_f32_e32 v15, 1.0, v27 +; SI-NEXT: v_mul_f32_e64 v25, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v3, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v5, 1.0, s25 +; SI-NEXT: v_mul_f32_e64 v7, 1.0, s29 +; SI-NEXT: v_mul_f32_e32 v9, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v54, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v11, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v56, 1.0, v8 +; SI-NEXT: v_mul_f32_e32 v13, 1.0, v10 +; SI-NEXT: v_mul_f32_e32 v58, 1.0, v12 +; SI-NEXT: v_mul_f32_e32 v60, 1.0, v14 +; SI-NEXT: v_mul_f32_e32 v62, 1.0, v16 +; SI-NEXT: v_mul_f32_e32 v47, 1.0, v22 +; SI-NEXT: v_mul_f32_e32 v22, 1.0, v28 +; SI-NEXT: v_mul_f32_e64 v19, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v14, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v16, 1.0, s22 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v32 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v33 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v34 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v35 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v36 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v37 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v38 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v39 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v48 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v49 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v0, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v3, 1.0, s16 ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s19 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v4, 1.0, s21 -; SI-NEXT: v_mul_f32_e64 v8, 1.0, s20 -; SI-NEXT: v_mul_f32_e64 v10, 1.0, s23 -; SI-NEXT: v_mul_f32_e64 v9, 1.0, s22 -; SI-NEXT: v_mul_f32_e64 v5, 1.0, s27 -; SI-NEXT: v_mul_f32_e64 v11, 1.0, s26 -; SI-NEXT: 
v_mul_f32_e64 v6, 1.0, s29 -; SI-NEXT: v_mul_f32_e64 v7, 1.0, s28 +; SI-NEXT: v_mul_f32_e32 v39, 1.0, v0 ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v48 -; SI-NEXT: v_mul_f32_e32 v48, 1.0, v26 -; SI-NEXT: v_mul_f32_e32 v22, 1.0, v51 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v45 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mul_f32_e32 v23, 1.0, v37 -; SI-NEXT: v_mul_f32_e32 v51, 1.0, v50 -; SI-NEXT: v_mul_f32_e32 v24, 1.0, v38 -; SI-NEXT: v_mul_f32_e32 v50, 1.0, v49 -; SI-NEXT: v_mul_f32_e32 v25, 1.0, v39 -; SI-NEXT: v_mul_f32_e32 v49, 1.0, v30 -; SI-NEXT: v_mul_f32_e32 v26, 1.0, v28 -; SI-NEXT: v_mul_f32_e32 v39, 1.0, v12 -; SI-NEXT: v_mul_f32_e32 v27, 1.0, v31 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_mul_f32_e32 v38, 1.0, v60 -; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_mul_f32_e64 v35, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v33, 1.0, s27 +; SI-NEXT: v_mul_f32_e32 v32, 1.0, v18 +; SI-NEXT: v_mul_f32_e32 v34, 1.0, v29 +; SI-NEXT: v_mul_f32_e32 v36, 1.0, v24 +; SI-NEXT: v_mul_f32_e32 v38, 1.0, v26 +; SI-NEXT: v_mul_f32_e32 v31, 1.0, v30 +; SI-NEXT: v_mul_f32_e32 v24, 1.0, v51 +; SI-NEXT: v_mul_f32_e32 v41, 1.0, v53 +; SI-NEXT: v_mul_f32_e32 v26, 1.0, v55 +; SI-NEXT: v_mul_f32_e32 v43, 1.0, v40 ; SI-NEXT: v_mul_f32_e32 v28, 1.0, v42 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_mul_f32_e32 v37, 1.0, v62 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_mul_f32_e32 v29, 1.0, v63 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_mul_f32_e32 v30, 1.0, v33 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_mul_f32_e32 v31, 1.0, v34 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_mul_f32_e32 v33, 1.0, v35 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_mul_f32_e32 v42, 1.0, v36 -; SI-NEXT: v_mul_f32_e64 v12, 1.0, s25 -; SI-NEXT: v_mul_f32_e64 v34, 1.0, s24 -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:252 ; 
4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v51, 1.0, v50 +; SI-NEXT: v_mul_f32_e32 v53, 1.0, v52 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v46 +; SI-NEXT: v_mul_f32_e64 v48, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v18, 1.0, s24 +; SI-NEXT: v_mul_f32_e64 v29, 1.0, s26 +; SI-NEXT: v_mul_f32_e64 v45, 1.0, s28 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 
offset:448 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB19_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_alignbit_b32 v0, v0, v3, 16 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_alignbit_b32 v2, v2, v8, 16 -; SI-NEXT: v_alignbit_b32 v3, v3, v9, 16 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_alignbit_b32 v6, v6, v7, 16 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v58 -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v56 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v44 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_alignbit_b32 v5, v5, v11, 16 -; SI-NEXT: v_alignbit_b32 v7, v7, v14, 16 -; SI-NEXT: v_alignbit_b32 v8, v8, v54, 16 -; SI-NEXT: v_alignbit_b32 v9, v9, v46, 16 -; SI-NEXT: v_mov_b32_e32 v62, v61 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v61 -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v60, v59 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v59 -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v56, v47 -; SI-NEXT: v_alignbit_b32 v13, v13, v47, 16 -; SI-NEXT: v_mov_b32_e32 v46, v45 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v45 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v12 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v57 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 -; SI-NEXT: v_alignbit_b32 v30, v30, v31, 16 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v33 -; SI-NEXT: v_alignbit_b32 v4, v4, v34, 16 -; SI-NEXT: v_mov_b32_e32 v63, v44 -; SI-NEXT: v_mov_b32_e32 v58, v57 -; SI-NEXT: v_mov_b32_e32 v44, v43 -; SI-NEXT: v_alignbit_b32 v16, v16, v43, 16 -; SI-NEXT: v_mov_b32_e32 v43, v41 -; SI-NEXT: v_alignbit_b32 v17, v17, v41, 16 -; SI-NEXT: v_alignbit_b32 v18, v18, v40, 16 -; SI-NEXT: v_mov_b32_e32 v40, v55 -; SI-NEXT: v_alignbit_b32 v19, v19, v55, 16 -; SI-NEXT: v_alignbit_b32 v20, v20, v48, 16 -; SI-NEXT: v_mov_b32_e32 v48, v53 -; SI-NEXT: v_alignbit_b32 v21, v21, v53, 16 -; SI-NEXT: v_alignbit_b32 v22, v22, v52, 16 -; SI-NEXT: v_mov_b32_e32 v52, v51 -; SI-NEXT: v_alignbit_b32 v23, v23, v51, 16 -; SI-NEXT: v_alignbit_b32 v24, v24, v50, 
16 -; SI-NEXT: v_mov_b32_e32 v50, v49 -; SI-NEXT: v_alignbit_b32 v25, v25, v49, 16 -; SI-NEXT: v_mov_b32_e32 v36, v39 -; SI-NEXT: v_alignbit_b32 v26, v26, v39, 16 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v34, v38 -; SI-NEXT: v_alignbit_b32 v27, v27, v38, 16 -; SI-NEXT: v_mov_b32_e32 v35, v37 -; SI-NEXT: v_alignbit_b32 v28, v28, v37, 16 -; SI-NEXT: v_mov_b32_e32 v37, v32 -; SI-NEXT: v_alignbit_b32 v29, v29, v32, 16 -; SI-NEXT: v_alignbit_b32 v31, v31, v42, 16 +; SI-NEXT: v_mov_b32_e32 v0, v19 +; SI-NEXT: v_mov_b32_e32 v37, v20 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v63 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v57 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v61 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v44 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v21 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 s[4:5], 0 -; SI-NEXT: v_mov_b32_e32 v32, v33 -; SI-NEXT: v_mov_b32_e32 v33, v42 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_alignbit_b32 v10, v10, v61, 16 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_alignbit_b32 v12, v12, v54, 16 -; SI-NEXT: v_mov_b32_e32 v41, v61 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[0:1], v[19:20], 16 +; SI-NEXT: v_mov_b32_e32 v1, v48 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[1:2], v[48:49], 16 +; SI-NEXT: v_mov_b32_e32 v2, v14 +; SI-NEXT: v_mov_b32_e32 v49, v15 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v3 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[2:3], v[14:15], 16 +; SI-NEXT: v_mov_b32_e32 v3, v16 +; SI-NEXT: v_mov_b32_e32 v20, v17 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v35 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[3:4], v[16:17], 16 +; SI-NEXT: v_mov_b32_e32 v4, v18 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v5 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[4:5], v[18:19], 16 +; SI-NEXT: v_mov_b32_e32 v5, v29 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[5:6], v[29:30], 16 +; SI-NEXT: v_mov_b32_e32 v6, v45 +; SI-NEXT: buffer_store_dword v6, 
off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[6:7], v[45:46], 16 +; SI-NEXT: v_mov_b32_e32 v7, v39 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[7:8], v[39:40], 16 +; SI-NEXT: v_mov_b32_e32 v8, v9 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v37 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v49 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[8:9], v[9:10], 16 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v9 +; SI-NEXT: v_mov_b32_e32 v9, v54 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[9:10], v[54:55], 16 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v55, v13 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v10 +; SI-NEXT: v_mov_b32_e32 v10, v11 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[10:11], v[11:12], 16 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v11 +; SI-NEXT: v_mov_b32_e32 v11, v56 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[11:12], v[56:57], 16 +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v56, v44 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v12 +; SI-NEXT: v_lshr_b64 v[12:13], v[13:14], 16 +; SI-NEXT: v_mov_b32_e32 v13, v58 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[13:14], v[58:59], 16 +; SI-NEXT: v_mov_b32_e32 v14, v60 +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[14:15], v[60:61], 16 +; SI-NEXT: v_mov_b32_e32 v15, v62 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[15:16], v[62:63], 16 +; SI-NEXT: 
v_mov_b32_e32 v16, v32 +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[16:17], v[32:33], 16 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v27 +; SI-NEXT: v_mov_b32_e32 v33, v34 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_alignbit_b32 v11, v11, v59, 16 -; SI-NEXT: v_mov_b32_e32 v55, v59 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_alignbit_b32 v14, v14, v45, 16 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v17 +; SI-NEXT: v_mov_b32_e32 v40, v17 +; SI-NEXT: v_lshr_b64 v[17:18], v[34:35], 16 +; SI-NEXT: v_lshr_b64 v[18:19], v[47:48], 16 +; SI-NEXT: v_lshr_b64 v[19:20], v[36:37], 16 +; SI-NEXT: v_mov_b32_e32 v20, v38 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[20:21], v[38:39], 16 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_mov_b32_e32 v34, v47 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v21 +; SI-NEXT: v_mov_b32_e32 v21, v22 +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[21:22], v[22:23], 16 +; SI-NEXT: v_mov_b32_e32 v22, v31 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[22:23], v[31:32], 16 +; SI-NEXT: v_mov_b32_e32 v23, v24 +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[23:24], v[24:25], 16 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v52 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v30 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v24 +; SI-NEXT: v_mov_b32_e32 v24, v41 +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[24:25], v[41:42], 16 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v41, v26 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v50 +; SI-NEXT: v_mov_b32_e32 v42, v51 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v25 +; SI-NEXT: v_lshr_b64 v[25:26], v[26:27], 16 +; SI-NEXT: v_mov_b32_e32 v26, v43 +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[26:27], v[43:44], 16 +; SI-NEXT: buffer_load_dword 
v27, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v43, v28 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v27 +; SI-NEXT: v_lshr_b64 v[27:28], v[28:29], 16 +; SI-NEXT: v_lshr_b64 v[28:29], v[51:52], 16 +; SI-NEXT: v_lshr_b64 v[29:30], v[53:54], 16 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v52, v53 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v30 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshr_b64 v[30:31], v[31:32], 16 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_alignbit_b32 v15, v15, v47, 16 -; SI-NEXT: v_mov_b32_e32 v51, v47 -; SI-NEXT: v_mov_b32_e32 v53, v45 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v57 +; SI-NEXT: v_mov_b32_e32 v53, v31 +; SI-NEXT: v_lshr_b64 v[31:32], v[31:32], 16 ; SI-NEXT: s_branch .LBB19_3 ; SI-NEXT: .LBB19_2: -; SI-NEXT: v_mov_b32_e32 v63, v44 -; SI-NEXT: v_mov_b32_e32 v44, v43 -; SI-NEXT: v_mov_b32_e32 v43, v41 -; SI-NEXT: v_mov_b32_e32 v40, v55 -; SI-NEXT: v_mov_b32_e32 v48, v53 -; SI-NEXT: v_mov_b32_e32 v52, v51 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v62, v61 -; SI-NEXT: v_mov_b32_e32 v60, v59 -; SI-NEXT: v_mov_b32_e32 v58, v57 -; SI-NEXT: v_mov_b32_e32 v56, v47 -; SI-NEXT: v_mov_b32_e32 v46, v45 -; SI-NEXT: v_mov_b32_e32 v50, v49 -; SI-NEXT: v_mov_b32_e32 v36, v39 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v34, v38 -; SI-NEXT: v_mov_b32_e32 v35, v37 -; SI-NEXT: v_mov_b32_e32 v37, v32 +; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 
offset:284 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v56, v44 +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v55, v13 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_mov_b32_e32 v33, v34 +; SI-NEXT: v_mov_b32_e32 v34, v47 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v43, v28 +; SI-NEXT: v_mov_b32_e32 v52, v53 +; SI-NEXT: v_mov_b32_e32 v53, v0 ; SI-NEXT: s_mov_b64 s[4:5], -1 -; SI-NEXT: v_mov_b32_e32 v32, v33 -; SI-NEXT: v_mov_b32_e32 v33, v42 +; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v41, v26 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v42, v51 ; SI-NEXT: ; implicit-def: 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: .LBB19_3: ; %Flow -; SI-NEXT: v_mov_b32_e32 v38, v50 -; SI-NEXT: v_mov_b32_e32 v39, v52 -; SI-NEXT: v_mov_b32_e32 v49, v40 -; SI-NEXT: v_mov_b32_e32 v50, v43 -; SI-NEXT: v_mov_b32_e32 v43, v44 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v37, v34 +; SI-NEXT: v_mov_b32_e32 v34, v33 +; SI-NEXT: v_mov_b32_e32 v35, v56 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_mov_b32_e32 v32, v40 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload ; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_mov_b32_e32 v33, v38 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mov_b32_e32 v51, v46 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mov_b32_e32 v54, v46 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mov_b32_e32 v44, v46 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mov_b32_e32 v45, v56 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mov_b32_e32 v47, v56 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword 
v57, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mov_b32_e32 v58, v60 +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload ; SI-NEXT: s_cbranch_vccnz .LBB19_5 ; SI-NEXT: ; %bb.4: ; %cmp.true -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v44 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v57 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v61 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v49 ; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 ; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v40 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v39 ; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 ; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v63 -; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v62 -; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v60 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v59 ; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 ; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v58 -; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v46 -; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; SI-NEXT: 
v_lshrrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v40 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v35 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v32 +; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v53 ; SI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; SI-NEXT: v_alignbit_b32 v1, v3, v2, 16 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 
offset:296 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 +; SI-NEXT: v_lshr_b64 v[1:2], v[2:3], 16 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v60 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 ; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 ; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 ; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 -; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 -; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 ; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 ; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 -; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 ; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 ; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 -; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 -; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 -; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 -; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 -; SI-NEXT: 
v_lshrrev_b32_e32 v27, 16, v27 -; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 ; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 ; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 -; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 ; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 ; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 ; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 -; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 -; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v58 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; SI-NEXT: v_lshr_b64 v[3:4], v[3:4], 16 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v56 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; SI-NEXT: v_lshr_b64 v[4:5], v[4:5], 16 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v47 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; SI-NEXT: v_lshr_b64 v[5:6], v[5:6], 16 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v45 ; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; SI-NEXT: v_alignbit_b32 v6, v7, v6, 16 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v45 +; SI-NEXT: v_lshr_b64 v[6:7], v[6:7], 16 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v46 ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; SI-NEXT: v_alignbit_b32 v7, v8, v7, 16 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v42 +; SI-NEXT: v_lshr_b64 v[7:8], v[7:8], 16 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v44 ; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; SI-NEXT: v_alignbit_b32 v8, v9, v8, 16 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v52 +; SI-NEXT: v_lshr_b64 v[8:9], v[8:9], 16 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v54 ; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16 -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, 
v41 +; SI-NEXT: v_lshr_b64 v[9:10], v[9:10], 16 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v51 ; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; SI-NEXT: v_alignbit_b32 v10, v11, v10, 16 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v55 +; SI-NEXT: v_lshr_b64 v[10:11], v[10:11], 16 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v50 ; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; SI-NEXT: v_alignbit_b32 v11, v12, v11, 16 -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v54 +; SI-NEXT: v_lshr_b64 v[11:12], v[11:12], 16 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v55 ; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; SI-NEXT: v_alignbit_b32 v12, v13, v12, 16 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v56 +; SI-NEXT: v_lshr_b64 v[12:13], v[12:13], 16 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v48 ; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; SI-NEXT: v_alignbit_b32 v13, v14, v13, 16 -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v53 +; SI-NEXT: v_lshr_b64 v[13:14], v[13:14], 16 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v38 ; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: v_alignbit_b32 v14, v15, v14, 16 -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v51 +; SI-NEXT: v_lshr_b64 v[14:15], v[14:15], 16 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v33 ; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; SI-NEXT: v_alignbit_b32 v15, v16, v15, 16 -; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v43 +; SI-NEXT: v_lshr_b64 v[15:16], v[15:16], 16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; SI-NEXT: v_alignbit_b32 v16, v17, v16, 16 -; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v50 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 ; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; SI-NEXT: v_alignbit_b32 v17, v18, v17, 16 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_lshr_b64 v[16:17], v[16:17], 16 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v34 +; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; SI-NEXT: v_lshr_b64 v[17:18], v[17:18], 16 +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v37 ; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; SI-NEXT: v_alignbit_b32 v18, v19, v18, 16 -; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v49 +; SI-NEXT: v_lshr_b64 v[18:19], v[18:19], 16 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v36 ; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 -; SI-NEXT: v_alignbit_b32 v19, v20, v19, 16 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshr_b64 v[19:20], v[19:20], 16 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v33, 
0xffff0000, v33 +; SI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; SI-NEXT: v_lshr_b64 v[32:33], v[32:33], 16 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 ; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 -; SI-NEXT: v_alignbit_b32 v20, v21, v20, 16 -; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v48 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_lshr_b64 v[20:21], v[20:21], 16 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 ; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 -; SI-NEXT: v_alignbit_b32 v21, v22, v21, 16 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 ; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 -; SI-NEXT: v_alignbit_b32 v22, v23, v22, 16 -; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v39 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_lshr_b64 v[21:22], v[21:22], 16 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_lshr_b64 v[22:23], v[22:23], 16 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 ; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 -; SI-NEXT: v_alignbit_b32 v23, v24, v23, 16 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 ; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 -; SI-NEXT: v_alignbit_b32 v24, v25, v24, 16 -; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v38 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_lshr_b64 v[23:24], v[23:24], 16 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 ; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 -; SI-NEXT: v_alignbit_b32 v25, v26, v25, 16 -; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v36 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_lshr_b64 v[24:25], v[24:25], 16 +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v41 +; 
SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 +; SI-NEXT: v_lshr_b64 v[25:26], v[25:26], 16 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 ; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 -; SI-NEXT: v_alignbit_b32 v26, v27, v26, 16 -; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v34 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 ; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 -; SI-NEXT: v_alignbit_b32 v27, v28, v27, 16 -; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v35 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_lshr_b64 v[26:27], v[26:27], 16 +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v43 +; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 +; SI-NEXT: v_lshr_b64 v[27:28], v[27:28], 16 +; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v42 ; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 -; SI-NEXT: v_alignbit_b32 v28, v29, v28, 16 -; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v37 +; SI-NEXT: v_lshr_b64 v[28:29], v[28:29], 16 +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v52 ; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 -; SI-NEXT: v_alignbit_b32 v29, v30, v29, 16 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshr_b64 v[29:30], v[29:30], 16 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 ; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 -; SI-NEXT: v_alignbit_b32 v30, v31, v30, 16 -; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v33 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 ; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 -; SI-NEXT: v_alignbit_b32 v31, v32, v31, 16 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_lshr_b64 v[30:31], v[30:31], 16 +; SI-NEXT: v_mov_b32_e32 v31, v32 ; SI-NEXT: .LBB19_5: ; %end ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload @@ -27885,36 +28069,39 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_mov_b32_e32 v31, v17 ; VI-NEXT: v_mov_b32_e32 v30, v16 ; VI-NEXT: v_mov_b32_e32 v29, v15 -; VI-NEXT: v_mov_b32_e32 v28, v14 ; VI-NEXT: v_mov_b32_e32 v27, v13 -; VI-NEXT: v_mov_b32_e32 v26, v12 ; VI-NEXT: v_mov_b32_e32 v25, v11 -; VI-NEXT: v_mov_b32_e32 v24, v10 ; VI-NEXT: v_mov_b32_e32 v23, v9 -; VI-NEXT: v_mov_b32_e32 v22, v8 ; VI-NEXT: v_mov_b32_e32 v21, v7 -; VI-NEXT: v_mov_b32_e32 v20, v6 ; VI-NEXT: v_mov_b32_e32 v19, v5 -; VI-NEXT: v_mov_b32_e32 v32, v4 ; VI-NEXT: v_mov_b32_e32 v17, v3 -; VI-NEXT: v_mov_b32_e32 v16, v2 ; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v28, v14 +; VI-NEXT: v_mov_b32_e32 v26, v12 +; VI-NEXT: v_mov_b32_e32 v24, v10 +; VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v32, v4 +; VI-NEXT: v_mov_b32_e32 v16, v2 ; VI-NEXT: v_mov_b32_e32 v14, v0 -; VI-NEXT: v_mov_b32_e32 v0, s16 -; VI-NEXT: v_mov_b32_e32 v1, s17 ; VI-NEXT: s_and_b64 
s[4:5], vcc, exec -; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v1, s17 ; VI-NEXT: v_mov_b32_e32 v3, s19 -; VI-NEXT: v_mov_b32_e32 v4, s20 ; VI-NEXT: v_mov_b32_e32 v5, s21 -; VI-NEXT: v_mov_b32_e32 v6, s22 ; VI-NEXT: v_mov_b32_e32 v7, s23 -; VI-NEXT: v_mov_b32_e32 v8, s24 ; VI-NEXT: v_mov_b32_e32 v9, s25 -; VI-NEXT: v_mov_b32_e32 v10, s26 ; VI-NEXT: v_mov_b32_e32 v11, s27 -; VI-NEXT: v_mov_b32_e32 v12, s28 ; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 ; 4-byte Folded Spill ; VI-NEXT: s_cbranch_scc0 .LBB19_4 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_cbranch_execnz .LBB19_3 @@ -27923,580 +28110,600 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v15, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v15 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_bfe_u32 v18, v15, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v33, v33, v34, vcc +; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v15 +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x7fff, v18 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v15 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 -; VI-NEXT: v_cndmask_b32_e32 v15, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; VI-NEXT: v_alignbit_b32 v15, v15, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v14 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v15, v18, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v15 +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v14 +; VI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; VI-NEXT: v_bfe_u32 v18, v15, 16, 1 +; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v15 ; VI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x7fff, v18 ; VI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v14, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v14 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v14 +; VI-NEXT: v_lshrrev_b64 v[34:35], 16, v[33:34] +; VI-NEXT: v_or_b32_e32 v33, 0x400000, v15 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; VI-NEXT: v_bfe_u32 v15, v14, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v35, v18, v33, vcc +; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v14 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v14 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 -; VI-NEXT: v_cndmask_b32_e32 v14, 
v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; VI-NEXT: v_alignbit_b32 v14, v14, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v13 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v14, v15, v18, vcc +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v14 +; VI-NEXT: v_lshrrev_b64 v[14:15], 16, v[35:36] +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v13 +; VI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; VI-NEXT: v_bfe_u32 v18, v15, 16, 1 +; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v15 ; VI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x7fff, v18 ; VI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v13, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v13 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v13 +; VI-NEXT: v_or_b32_e32 v33, 0x400000, v15 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; VI-NEXT: v_bfe_u32 v15, v13, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v35, v18, v33, vcc +; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v13 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v13 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 -; VI-NEXT: v_cndmask_b32_e32 v13, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; VI-NEXT: v_alignbit_b32 v13, v13, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v12 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v13, v15, v18, vcc +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v13 +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v12 +; VI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; VI-NEXT: v_bfe_u32 v15, v13, 16, 1 +; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v13 ; VI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 ; VI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v12, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v12 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v12 +; VI-NEXT: v_lshrrev_b64 v[35:36], 16, v[35:36] +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v13 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; VI-NEXT: v_bfe_u32 v13, v12, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v36, v15, v18, vcc +; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v12 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v12 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 -; VI-NEXT: v_cndmask_b32_e32 v12, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; VI-NEXT: v_alignbit_b32 v12, v12, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v11 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v12, v13, v15, vcc +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v12 +; VI-NEXT: v_lshrrev_b64 v[12:13], 16, v[36:37] +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v11 +; VI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; VI-NEXT: v_bfe_u32 v15, v13, 16, 1 +; VI-NEXT: 
v_add_u32_e32 v15, vcc, v15, v13 ; VI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 ; VI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v11, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v11 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v11 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v13 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; VI-NEXT: v_bfe_u32 v13, v11, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v36, v15, v18, vcc +; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v11 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v11 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 -; VI-NEXT: v_cndmask_b32_e32 v11, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; VI-NEXT: v_alignbit_b32 v11, v11, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v10 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v11, v13, v15, vcc +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v11 +; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v10 +; VI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; VI-NEXT: v_bfe_u32 v13, v11, 16, 1 +; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v11 ; VI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 ; VI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v10, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v10 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v10 +; VI-NEXT: v_lshrrev_b64 v[36:37], 16, v[36:37] +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v11 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; VI-NEXT: v_bfe_u32 v11, v10, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v37, v13, v15, vcc +; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v10 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 +; VI-NEXT: v_or_b32_e32 v13, 0x400000, v10 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 -; VI-NEXT: v_cndmask_b32_e32 v10, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; VI-NEXT: v_alignbit_b32 v10, v10, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v9 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v10, v11, v13, vcc +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v10 +; VI-NEXT: v_lshrrev_b64 v[10:11], 16, v[37:38] +; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v9 +; VI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; VI-NEXT: v_bfe_u32 v13, v11, 16, 1 +; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v11 ; VI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 ; VI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v9, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v9 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v9 +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v11 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; VI-NEXT: v_bfe_u32 v11, v9, 16, 1 +; VI-NEXT: 
v_cndmask_b32_e32 v37, v13, v15, vcc +; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v9 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 +; VI-NEXT: v_or_b32_e32 v13, 0x400000, v9 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 -; VI-NEXT: v_cndmask_b32_e32 v9, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; VI-NEXT: v_alignbit_b32 v9, v9, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v8 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v9, v11, v13, vcc +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v9 +; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v8 +; VI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; VI-NEXT: v_bfe_u32 v11, v9, 16, 1 +; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v9 ; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 ; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v8, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v8 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v8 +; VI-NEXT: v_lshrrev_b64 v[37:38], 16, v[37:38] +; VI-NEXT: v_or_b32_e32 v13, 0x400000, v9 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; VI-NEXT: v_bfe_u32 v9, v8, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v38, v11, v13, vcc +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; VI-NEXT: v_or_b32_e32 v11, 0x400000, v8 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 -; VI-NEXT: v_cndmask_b32_e32 v8, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; VI-NEXT: v_alignbit_b32 v8, v8, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v7 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v8, v9, v11, vcc +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v8 +; VI-NEXT: v_lshrrev_b64 v[8:9], 16, v[38:39] +; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v7 +; VI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; VI-NEXT: v_bfe_u32 v11, v9, 16, 1 +; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v9 ; VI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 ; VI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v7, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v7 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v7 +; VI-NEXT: v_or_b32_e32 v13, 0x400000, v9 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; VI-NEXT: v_bfe_u32 v9, v7, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v38, v11, v13, vcc +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v7 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; VI-NEXT: v_or_b32_e32 v11, 0x400000, v7 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 -; VI-NEXT: v_cndmask_b32_e32 v7, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; VI-NEXT: v_alignbit_b32 v7, v7, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v6 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v7, v9, v11, vcc +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v7 +; VI-NEXT: 
v_lshlrev_b32_e32 v7, 16, v6 +; VI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; VI-NEXT: v_bfe_u32 v9, v7, 16, 1 +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v7 ; VI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 ; VI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v6, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v6 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v6 +; VI-NEXT: v_or_b32_e32 v11, 0x400000, v7 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; VI-NEXT: v_bfe_u32 v7, v6, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v48, v9, v11, vcc +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v6 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v6 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; VI-NEXT: v_cndmask_b32_e32 v6, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; VI-NEXT: v_alignbit_b32 v6, v6, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v5 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v6, v7, v9, vcc +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v6 +; VI-NEXT: v_lshrrev_b64 v[6:7], 16, v[48:49] +; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v5 +; VI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; VI-NEXT: v_bfe_u32 v9, v7, 16, 1 +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v7 ; VI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 ; VI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v5, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v5 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v5 +; VI-NEXT: v_or_b32_e32 v11, 0x400000, v7 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; VI-NEXT: v_bfe_u32 v7, v5, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v48, v9, v11, vcc +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v5 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; VI-NEXT: v_cndmask_b32_e32 v5, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; VI-NEXT: v_alignbit_b32 v5, v5, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v4 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v5 +; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; VI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; VI-NEXT: v_bfe_u32 v7, v5, 16, 1 +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v5 ; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 ; VI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v4, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v4 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v4 +; VI-NEXT: v_lshrrev_b64 v[48:49], 16, v[48:49] +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: v_bfe_u32 
v5, v4, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v49, v7, v9, vcc +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v4 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; VI-NEXT: v_cndmask_b32_e32 v4, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; VI-NEXT: v_alignbit_b32 v4, v4, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v3 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v4, v5, v7, vcc +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v4 +; VI-NEXT: v_lshrrev_b64 v[4:5], 16, v[49:50] +; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v3 +; VI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; VI-NEXT: v_bfe_u32 v7, v5, 16, 1 +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v5 ; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 ; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v3 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v3 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v49, v7, v9, vcc +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_cndmask_b32_e32 v3, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_alignbit_b32 v3, v3, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v2 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v3 +; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 ; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 ; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v2 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v2 +; VI-NEXT: v_lshrrev_b64 v[49:50], 16, v[49:50] +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v50, v5, v7, vcc +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: v_alignbit_b32 v2, v2, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v1 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v2 +; VI-NEXT: v_lshrrev_b64 
v[2:3], 16, v[50:51] +; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 ; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 ; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v1 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v1 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v50, v5, v7, vcc +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: v_cndmask_b32_e32 v1, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-NEXT: v_alignbit_b32 v1, v1, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v0 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v0 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 ; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v0 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v0 +; VI-NEXT: v_lshrrev_b64 v[50:51], 16, v[50:51] +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v51, v3, v5, vcc +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v0, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v31 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v31, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v31 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v31 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 -; VI-NEXT: v_cndmask_b32_e32 v31, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 -; VI-NEXT: v_alignbit_b32 v31, v31, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v30 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; 
VI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v30, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v30 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v30 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 -; VI-NEXT: v_cndmask_b32_e32 v30, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 -; VI-NEXT: v_alignbit_b32 v30, v30, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v29 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v29, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v29 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v29 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 -; VI-NEXT: v_cndmask_b32_e32 v29, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 -; VI-NEXT: v_alignbit_b32 v29, v29, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v28 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v28, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v28 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v28 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 -; VI-NEXT: v_cndmask_b32_e32 v28, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 -; VI-NEXT: v_alignbit_b32 v28, v28, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v27 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v27, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v27 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v27 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 -; VI-NEXT: v_cndmask_b32_e32 v27, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 -; VI-NEXT: v_alignbit_b32 v27, v27, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v26 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v26, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v26 -; VI-NEXT: v_add_u32_e32 
v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v26 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 -; VI-NEXT: v_cndmask_b32_e32 v26, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 -; VI-NEXT: v_alignbit_b32 v26, v26, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v25 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v25, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v25 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v25 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 -; VI-NEXT: v_cndmask_b32_e32 v25, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 -; VI-NEXT: v_alignbit_b32 v25, v25, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v24 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v24, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v24 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v24 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 -; VI-NEXT: v_cndmask_b32_e32 v24, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 -; VI-NEXT: v_alignbit_b32 v24, v24, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v23 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v23, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v23 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v23 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 -; VI-NEXT: v_cndmask_b32_e32 v23, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 -; VI-NEXT: v_alignbit_b32 v23, v23, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v22 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v22, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v22 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v22 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 -; VI-NEXT: v_cndmask_b32_e32 v22, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 -; VI-NEXT: v_alignbit_b32 v22, v22, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v21 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; 
VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v21, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v21 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v21 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 -; VI-NEXT: v_cndmask_b32_e32 v21, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 -; VI-NEXT: v_alignbit_b32 v21, v21, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v20 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v20, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v20 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v20 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 -; VI-NEXT: v_cndmask_b32_e32 v20, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 -; VI-NEXT: v_alignbit_b32 v20, v20, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v19 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v19, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v19 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v19 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 -; VI-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 -; VI-NEXT: v_alignbit_b32 v19, v19, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v32 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; VI-NEXT: v_alignbit_b32 v32, v32, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v17 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; VI-NEXT: v_cndmask_b32_e32 
v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v17, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v17 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v17 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 -; VI-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; VI-NEXT: v_alignbit_b32 v17, v17, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v16 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v16, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v16 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v16 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 -; VI-NEXT: v_cndmask_b32_e32 v16, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; VI-NEXT: v_alignbit_b32 v16, v16, v18, 16 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v0 +; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[51:52] +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v31 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v31 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v51, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v30 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v30 +; VI-NEXT: v_lshrrev_b64 v[51:52], 16, v[51:52] +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v52, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v29 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v29 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_lshrrev_b64 v[30:31], 16, v[52:53] +; VI-NEXT: v_cndmask_b32_e32 v52, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: 
v_lshrrev_b32_e32 v53, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v28 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v28 +; VI-NEXT: v_lshrrev_b64 v[52:53], 16, v[52:53] +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v53, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v27 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v27 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_lshrrev_b64 v[28:29], 16, v[53:54] +; VI-NEXT: v_cndmask_b32_e32 v53, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v26 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v26 +; VI-NEXT: v_lshrrev_b64 v[53:54], 16, v[53:54] +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v54, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v25 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v25 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_lshrrev_b64 v[26:27], 16, v[54:55] +; VI-NEXT: v_cndmask_b32_e32 v54, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v24 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v24 +; VI-NEXT: v_lshrrev_b64 v[38:39], 16, v[38:39] +; VI-NEXT: v_add_f32_e32 v1, 
0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v39, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v23 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v23 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_lshrrev_b64 v[24:25], 16, v[39:40] +; VI-NEXT: v_cndmask_b32_e32 v39, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v22 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v22 +; VI-NEXT: v_lshrrev_b64 v[39:40], 16, v[39:40] +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v40, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v21 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v21 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_lshrrev_b64 v[22:23], 16, v[40:41] +; VI-NEXT: v_cndmask_b32_e32 v40, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v20 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v20 +; VI-NEXT: v_lshrrev_b64 v[40:41], 16, v[40:41] +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v41, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v19 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: 
v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v19 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v18, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v32 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v32 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v31, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v17 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v17 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_lshrrev_b64 v[20:21], 16, v[41:42] +; VI-NEXT: v_cndmask_b32_e32 v41, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v16 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v16 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v15, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b64 v[54:55], 16, v[54:55] +; VI-NEXT: v_lshrrev_b64 v[18:19], 16, v[18:19] +; VI-NEXT: v_lshrrev_b64 v[41:42], 16, v[41:42] +; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v1 +; VI-NEXT: v_lshrrev_b64 v[32:33], 16, v[31:32] +; VI-NEXT: v_lshrrev_b64 v[16:17], 16, v[15:16] +; VI-NEXT: v_mov_b32_e32 v1, v50 +; VI-NEXT: v_mov_b32_e32 v3, v49 +; VI-NEXT: v_mov_b32_e32 v5, v48 +; VI-NEXT: v_mov_b32_e32 v7, v38 +; VI-NEXT: v_mov_b32_e32 v9, v37 +; VI-NEXT: v_mov_b32_e32 v11, v36 +; VI-NEXT: v_mov_b32_e32 v13, v35 +; VI-NEXT: v_mov_b32_e32 v15, v34 +; VI-NEXT: v_mov_b32_e32 v17, v41 +; VI-NEXT: v_mov_b32_e32 v19, v18 +; VI-NEXT: v_mov_b32_e32 v21, v40 +; VI-NEXT: v_mov_b32_e32 v23, v39 +; VI-NEXT: v_mov_b32_e32 v25, v54 +; 
VI-NEXT: v_mov_b32_e32 v27, v53 +; VI-NEXT: v_mov_b32_e32 v29, v52 +; VI-NEXT: v_mov_b32_e32 v31, v51 ; VI-NEXT: .LBB19_3: ; %end +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v18, v32 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB19_4: ; VI-NEXT: s_branch .LBB19_2 @@ -61886,214 +62093,213 @@ define inreg <64 x bfloat> @bitcast_v32f32_to_v64bf16_scalar(<32 x float> inreg ; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 16 ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; SI-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v61 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v61 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v60 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v59 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v59 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v58 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v57 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v57 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v56 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v47 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v47 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v46 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v45 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v45 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v44 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v43 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v43 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v42 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v41 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, 
v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v41 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v40 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v55 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v55 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v54 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v53 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v53 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v52 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v51 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v51 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v50 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v49 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v49 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v48 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v39 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v39 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v38 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v37 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v37 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v36 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v35 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v35 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v34 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v33 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v33 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v32 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v31 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v31 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v30 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v29 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v29 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v28 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v27 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v27 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v26 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v25 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v25 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v24 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v23 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v23 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v22 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v21 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v21 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v20 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v19 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v19 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v18 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v17 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v17 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v16 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v15 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v15 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 ; SI-NEXT: buffer_store_dword 
v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v14 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v13 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v13 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v12 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v11 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v11 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v10 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v9 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v9 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v8 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v7 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v7 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v6 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v5 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v5 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: v_readlane_b32 s99, v63, 35 ; SI-NEXT: v_readlane_b32 s98, v63, 34 ; SI-NEXT: v_readlane_b32 s97, v63, 33 @@ -62130,22 +62336,23 @@ define inreg <64 x bfloat> @bitcast_v32f32_to_v64bf16_scalar(<32 x float> inreg ; SI-NEXT: v_readlane_b32 s34, v63, 2 ; SI-NEXT: v_readlane_b32 s31, v63, 1 ; SI-NEXT: v_readlane_b32 s30, v63, 0 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v4 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; 
SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -65386,562 +65593,737 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v52, v30 -; SI-NEXT: v_mov_b32_e32 v53, v28 -; SI-NEXT: v_mov_b32_e32 v40, v12 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:40 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:68 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:72 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:68 -; SI-NEXT: v_mov_b32_e32 v55, v14 -; SI-NEXT: v_mul_f32_e32 v14, 1.0, v0 -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v6 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 
offset:164 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v63, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v5 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v8 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v7 +; SI-NEXT: v_mov_b32_e32 v43, v21 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v10 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v11 +; SI-NEXT: v_mov_b32_e32 v54, v29 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v55 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v43 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v16 -; SI-NEXT: v_mul_f32_e32 v58, 1.0, v1 -; SI-NEXT: v_mul_f32_e32 v56, 1.0, v3 -; SI-NEXT: v_mul_f32_e32 v54, 1.0, v2 -; SI-NEXT: v_mul_f32_e32 v44, 1.0, v5 -; SI-NEXT: v_mul_f32_e32 v46, 1.0, v4 -; SI-NEXT: v_mul_f32_e32 v61, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v54 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v41, v23 +; SI-NEXT: v_mov_b32_e32 v29, v20 +; SI-NEXT: v_mul_f32_e32 v57, 1.0, v3 ; SI-NEXT: v_mul_f32_e32 v59, 1.0, v9 -; SI-NEXT: v_mul_f32_e32 v57, 1.0, v11 -; SI-NEXT: v_mul_f32_e32 v13, 1.0, v13 -; SI-NEXT: v_mul_f32_e32 v47, 1.0, v40 -; SI-NEXT: v_mul_f32_e32 v45, 1.0, v15 -; SI-NEXT: v_mul_f32_e32 v15, 1.0, v17 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: v_mul_f32_e32 v16, 1.0, v19 -; SI-NEXT: v_mul_f32_e32 v43, 1.0, v18 -; SI-NEXT: v_mul_f32_e32 v17, 1.0, v21 -; SI-NEXT: v_mul_f32_e32 v41, 1.0, v20 -; SI-NEXT: v_mul_f32_e32 v18, 1.0, v23 -; SI-NEXT: v_mul_f32_e32 v40, 1.0, v22 -; SI-NEXT: v_mul_f32_e32 v19, 1.0, v25 -; SI-NEXT: v_mul_f32_e32 v55, 1.0, v24 -; SI-NEXT: v_mul_f32_e32 v20, 1.0, v27 -; SI-NEXT: v_mul_f32_e32 v21, 1.0, v29 -; SI-NEXT: v_mul_f32_e32 v53, 1.0, v53 -; SI-NEXT: v_mul_f32_e32 v52, 1.0, v52 +; SI-NEXT: v_mul_f32_e32 v61, 1.0, v13 +; SI-NEXT: v_mul_f32_e32 v23, 1.0, v15 +; SI-NEXT: v_mul_f32_e32 v44, 1.0, v17 +; SI-NEXT: v_mul_f32_e32 v21, 1.0, v19 +; SI-NEXT: v_mul_f32_e32 v20, 1.0, v41 +; SI-NEXT: v_mul_f32_e32 v17, 1.0, v25 +; SI-NEXT: v_mul_f32_e32 v15, 1.0, v27 +; SI-NEXT: v_mul_f32_e64 v25, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v3, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v5, 1.0, s25 +; SI-NEXT: v_mul_f32_e64 v7, 1.0, s29 +; SI-NEXT: v_mul_f32_e32 v9, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v54, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v11, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v56, 1.0, v8 +; SI-NEXT: v_mul_f32_e32 v13, 1.0, v10 +; SI-NEXT: v_mul_f32_e32 v58, 1.0, v12 +; SI-NEXT: v_mul_f32_e32 v60, 1.0, v14 +; SI-NEXT: v_mul_f32_e32 v62, 1.0, v16 +; SI-NEXT: v_mul_f32_e32 v47, 1.0, v22 +; SI-NEXT: v_mul_f32_e32 v22, 1.0, v28 +; SI-NEXT: v_mul_f32_e64 v19, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v14, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v16, 1.0, s22 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v32 +; SI-NEXT: 
buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v33 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v34 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v35 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v36 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v37 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v38 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v39 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v48 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v49 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v0, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v3, 1.0, s16 ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s19 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v4, 1.0, s21 -; SI-NEXT: v_mul_f32_e64 v8, 1.0, s20 -; SI-NEXT: v_mul_f32_e64 v10, 1.0, s23 -; SI-NEXT: v_mul_f32_e64 v9, 1.0, s22 -; SI-NEXT: v_mul_f32_e64 v5, 1.0, s27 -; SI-NEXT: v_mul_f32_e64 v11, 1.0, s26 -; SI-NEXT: v_mul_f32_e64 v6, 1.0, s29 -; SI-NEXT: v_mul_f32_e64 v7, 1.0, s28 +; SI-NEXT: v_mul_f32_e32 v39, 1.0, v0 ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v48 -; SI-NEXT: v_mul_f32_e32 v48, 1.0, v26 -; SI-NEXT: v_mul_f32_e32 v22, 1.0, v51 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v45 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mul_f32_e32 v23, 1.0, v37 -; SI-NEXT: v_mul_f32_e32 v51, 1.0, v50 -; SI-NEXT: v_mul_f32_e32 v24, 1.0, v38 -; SI-NEXT: v_mul_f32_e32 v50, 1.0, v49 -; SI-NEXT: v_mul_f32_e32 v25, 1.0, v39 -; SI-NEXT: v_mul_f32_e32 v49, 1.0, v30 -; SI-NEXT: v_mul_f32_e32 v26, 1.0, v28 -; SI-NEXT: v_mul_f32_e32 v39, 1.0, v12 -; SI-NEXT: v_mul_f32_e32 v27, 1.0, v31 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_mul_f32_e32 v38, 1.0, v60 -; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_mul_f32_e64 v35, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v33, 1.0, s27 +; SI-NEXT: v_mul_f32_e32 v32, 1.0, v18 +; SI-NEXT: v_mul_f32_e32 v34, 1.0, v29 +; SI-NEXT: v_mul_f32_e32 v36, 1.0, v24 +; SI-NEXT: v_mul_f32_e32 v38, 1.0, v26 +; SI-NEXT: v_mul_f32_e32 v31, 1.0, v30 +; SI-NEXT: v_mul_f32_e32 v24, 1.0, v51 +; SI-NEXT: v_mul_f32_e32 v41, 1.0, v53 +; SI-NEXT: v_mul_f32_e32 v26, 1.0, v55 +; SI-NEXT: v_mul_f32_e32 v43, 1.0, v40 ; SI-NEXT: v_mul_f32_e32 v28, 1.0, v42 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_mul_f32_e32 v37, 1.0, v62 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_mul_f32_e32 v29, 1.0, v63 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: 
v_mul_f32_e32 v32, 1.0, v32 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_mul_f32_e32 v30, 1.0, v33 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_mul_f32_e32 v31, 1.0, v34 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_mul_f32_e32 v33, 1.0, v35 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_mul_f32_e32 v42, 1.0, v36 -; SI-NEXT: v_mul_f32_e64 v12, 1.0, s25 -; SI-NEXT: v_mul_f32_e64 v34, 1.0, s24 -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:308 ; 4-byte 
Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v51, 1.0, v50 +; SI-NEXT: v_mul_f32_e32 v53, 1.0, v52 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v46 +; SI-NEXT: v_mul_f32_e64 v48, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v18, 1.0, s24 +; SI-NEXT: v_mul_f32_e64 v29, 1.0, s26 +; SI-NEXT: v_mul_f32_e64 v45, 1.0, s28 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB43_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_alignbit_b32 v0, v0, v3, 16 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_alignbit_b32 v2, v2, v8, 16 -; SI-NEXT: v_alignbit_b32 v3, v3, v9, 16 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_alignbit_b32 v6, v6, v7, 16 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v58 -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v56 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v44 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_alignbit_b32 v5, v5, v11, 16 -; SI-NEXT: v_alignbit_b32 v7, v7, v14, 16 -; SI-NEXT: v_alignbit_b32 v8, v8, v54, 16 -; SI-NEXT: v_alignbit_b32 v9, v9, v46, 16 -; SI-NEXT: v_mov_b32_e32 v62, v61 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v61 -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v60, v59 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v59 -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v56, v47 -; SI-NEXT: v_alignbit_b32 v13, v13, v47, 16 -; SI-NEXT: v_mov_b32_e32 v46, v45 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v45 -; 
SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v12 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v57 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 -; SI-NEXT: v_alignbit_b32 v30, v30, v31, 16 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v33 -; SI-NEXT: v_alignbit_b32 v4, v4, v34, 16 -; SI-NEXT: v_mov_b32_e32 v63, v44 -; SI-NEXT: v_mov_b32_e32 v58, v57 -; SI-NEXT: v_mov_b32_e32 v44, v43 -; SI-NEXT: v_alignbit_b32 v16, v16, v43, 16 -; SI-NEXT: v_mov_b32_e32 v43, v41 -; SI-NEXT: v_alignbit_b32 v17, v17, v41, 16 -; SI-NEXT: v_alignbit_b32 v18, v18, v40, 16 -; SI-NEXT: v_mov_b32_e32 v40, v55 -; SI-NEXT: v_alignbit_b32 v19, v19, v55, 16 -; SI-NEXT: v_alignbit_b32 v20, v20, v48, 16 -; SI-NEXT: v_mov_b32_e32 v48, v53 -; SI-NEXT: v_alignbit_b32 v21, v21, v53, 16 -; SI-NEXT: v_alignbit_b32 v22, v22, v52, 16 -; SI-NEXT: v_mov_b32_e32 v52, v51 -; SI-NEXT: v_alignbit_b32 v23, v23, v51, 16 -; SI-NEXT: v_alignbit_b32 v24, v24, v50, 16 -; SI-NEXT: v_mov_b32_e32 v50, v49 -; SI-NEXT: v_alignbit_b32 v25, v25, v49, 16 -; SI-NEXT: v_mov_b32_e32 v36, v39 -; SI-NEXT: v_alignbit_b32 v26, v26, v39, 16 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v34, v38 -; SI-NEXT: v_alignbit_b32 v27, v27, v38, 16 -; SI-NEXT: v_mov_b32_e32 v35, v37 -; SI-NEXT: v_alignbit_b32 v28, v28, v37, 16 -; SI-NEXT: v_mov_b32_e32 v37, v32 -; SI-NEXT: v_alignbit_b32 v29, v29, v32, 16 -; SI-NEXT: v_alignbit_b32 v31, v31, v42, 16 +; SI-NEXT: v_mov_b32_e32 v0, v19 +; SI-NEXT: v_mov_b32_e32 v37, v20 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v63 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v57 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v61 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v44 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v21 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 s[4:5], 0 -; SI-NEXT: v_mov_b32_e32 v32, v33 -; SI-NEXT: v_mov_b32_e32 v33, v42 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_alignbit_b32 v10, v10, v61, 16 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_alignbit_b32 v12, v12, v54, 16 -; SI-NEXT: v_mov_b32_e32 v41, v61 +; SI-NEXT: v_lshrrev_b32_e32 v49, 
16, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[0:1], v[19:20], 16 +; SI-NEXT: v_mov_b32_e32 v1, v48 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[1:2], v[48:49], 16 +; SI-NEXT: v_mov_b32_e32 v2, v14 +; SI-NEXT: v_mov_b32_e32 v49, v15 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v3 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[2:3], v[14:15], 16 +; SI-NEXT: v_mov_b32_e32 v3, v16 +; SI-NEXT: v_mov_b32_e32 v20, v17 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v35 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[3:4], v[16:17], 16 +; SI-NEXT: v_mov_b32_e32 v4, v18 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v5 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[4:5], v[18:19], 16 +; SI-NEXT: v_mov_b32_e32 v5, v29 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[5:6], v[29:30], 16 +; SI-NEXT: v_mov_b32_e32 v6, v45 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[6:7], v[45:46], 16 +; SI-NEXT: v_mov_b32_e32 v7, v39 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[7:8], v[39:40], 16 +; SI-NEXT: v_mov_b32_e32 v8, v9 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v37 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v49 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[8:9], v[9:10], 16 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v9 +; SI-NEXT: v_mov_b32_e32 v9, v54 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[9:10], v[54:55], 16 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v55, v13 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v10 +; SI-NEXT: 
v_mov_b32_e32 v10, v11 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[10:11], v[11:12], 16 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v11 +; SI-NEXT: v_mov_b32_e32 v11, v56 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[11:12], v[56:57], 16 +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v56, v44 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v12 +; SI-NEXT: v_lshr_b64 v[12:13], v[13:14], 16 +; SI-NEXT: v_mov_b32_e32 v13, v58 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[13:14], v[58:59], 16 +; SI-NEXT: v_mov_b32_e32 v14, v60 +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[14:15], v[60:61], 16 +; SI-NEXT: v_mov_b32_e32 v15, v62 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[15:16], v[62:63], 16 +; SI-NEXT: v_mov_b32_e32 v16, v32 +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[16:17], v[32:33], 16 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v27 +; SI-NEXT: v_mov_b32_e32 v33, v34 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_alignbit_b32 v11, v11, v59, 16 -; SI-NEXT: v_mov_b32_e32 v55, v59 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_alignbit_b32 v14, v14, v45, 16 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v17 +; SI-NEXT: v_mov_b32_e32 v40, v17 +; SI-NEXT: v_lshr_b64 v[17:18], v[34:35], 16 +; SI-NEXT: v_lshr_b64 v[18:19], v[47:48], 16 +; SI-NEXT: v_lshr_b64 v[19:20], v[36:37], 16 +; SI-NEXT: v_mov_b32_e32 v20, v38 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[20:21], v[38:39], 16 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_mov_b32_e32 v34, v47 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v21 +; SI-NEXT: v_mov_b32_e32 v21, v22 +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:372 ; 4-byte 
Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[21:22], v[22:23], 16 +; SI-NEXT: v_mov_b32_e32 v22, v31 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[22:23], v[31:32], 16 +; SI-NEXT: v_mov_b32_e32 v23, v24 +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[23:24], v[24:25], 16 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v52 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v30 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v24 +; SI-NEXT: v_mov_b32_e32 v24, v41 +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[24:25], v[41:42], 16 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v41, v26 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v50 +; SI-NEXT: v_mov_b32_e32 v42, v51 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v25 +; SI-NEXT: v_lshr_b64 v[25:26], v[26:27], 16 +; SI-NEXT: v_mov_b32_e32 v26, v43 +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[26:27], v[43:44], 16 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v43, v28 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v27 +; SI-NEXT: v_lshr_b64 v[27:28], v[28:29], 16 +; SI-NEXT: v_lshr_b64 v[28:29], v[51:52], 16 +; SI-NEXT: v_lshr_b64 v[29:30], v[53:54], 16 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v52, v53 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_alignbit_b32 v15, v15, v47, 16 -; SI-NEXT: v_mov_b32_e32 v51, v47 -; SI-NEXT: v_mov_b32_e32 v53, v45 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v30 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshr_b64 v[30:31], v[31:32], 16 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v57 +; SI-NEXT: v_mov_b32_e32 v53, v31 +; SI-NEXT: v_lshr_b64 v[31:32], v[31:32], 16 ; SI-NEXT: s_branch .LBB43_3 ; SI-NEXT: .LBB43_2: -; SI-NEXT: v_mov_b32_e32 v63, v44 -; SI-NEXT: v_mov_b32_e32 v44, v43 -; SI-NEXT: v_mov_b32_e32 v43, v41 -; SI-NEXT: v_mov_b32_e32 v40, v55 -; SI-NEXT: v_mov_b32_e32 v48, v53 -; SI-NEXT: 
v_mov_b32_e32 v52, v51 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v62, v61 -; SI-NEXT: v_mov_b32_e32 v60, v59 -; SI-NEXT: v_mov_b32_e32 v58, v57 -; SI-NEXT: v_mov_b32_e32 v56, v47 -; SI-NEXT: v_mov_b32_e32 v46, v45 -; SI-NEXT: v_mov_b32_e32 v50, v49 -; SI-NEXT: v_mov_b32_e32 v36, v39 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v34, v38 -; SI-NEXT: v_mov_b32_e32 v35, v37 -; SI-NEXT: v_mov_b32_e32 v37, v32 +; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 
offset:420 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v56, v44 +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v55, v13 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_mov_b32_e32 v33, v34 +; SI-NEXT: v_mov_b32_e32 v34, v47 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v43, v28 +; SI-NEXT: v_mov_b32_e32 v52, v53 +; SI-NEXT: v_mov_b32_e32 v53, v0 ; SI-NEXT: s_mov_b64 s[4:5], -1 -; SI-NEXT: v_mov_b32_e32 v32, v33 -; SI-NEXT: v_mov_b32_e32 v33, v42 +; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v41, v26 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v42, v51 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: .LBB43_3: ; %Flow -; SI-NEXT: v_mov_b32_e32 v38, v50 -; SI-NEXT: v_mov_b32_e32 v39, v52 -; SI-NEXT: v_mov_b32_e32 v49, v40 -; SI-NEXT: v_mov_b32_e32 v50, v43 -; SI-NEXT: v_mov_b32_e32 v43, v44 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v37, v34 +; SI-NEXT: v_mov_b32_e32 v34, v33 +; SI-NEXT: v_mov_b32_e32 v35, v56 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_mov_b32_e32 v32, v40 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload ; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_mov_b32_e32 v33, v38 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: 
buffer_load_dword v49, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mov_b32_e32 v51, v46 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mov_b32_e32 v54, v46 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mov_b32_e32 v44, v46 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mov_b32_e32 v45, v56 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mov_b32_e32 v47, v56 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mov_b32_e32 v58, v60 +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload ; SI-NEXT: s_cbranch_vccnz .LBB43_5 ; SI-NEXT: ; %bb.4: ; %cmp.true -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v44 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v57 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v61 +; SI-NEXT: v_add_f32_e32 
v2, 0x40c00000, v2 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v49 ; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 ; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v40 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v39 ; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 ; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v63 -; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v62 -; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v60 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v59 ; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 ; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v58 -; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v46 -; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v40 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v35 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v32 +; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v53 ; SI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: 
buffer_load_dword v21, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; SI-NEXT: v_alignbit_b32 v1, v3, v2, 16 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 +; SI-NEXT: v_lshr_b64 v[1:2], v[2:3], 16 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v60 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 ; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 ; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 ; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 -; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 -; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: 
v_and_b32_e32 v13, 0xffff0000, v13 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 ; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 ; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 -; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 ; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 ; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 -; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 -; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 -; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 -; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 -; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 ; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 ; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 -; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 ; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 ; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 ; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 -; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 -; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v58 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16 
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; SI-NEXT: v_lshr_b64 v[3:4], v[3:4], 16 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v56 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; SI-NEXT: v_lshr_b64 v[4:5], v[4:5], 16 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v47 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; SI-NEXT: v_lshr_b64 v[5:6], v[5:6], 16 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v45 ; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; SI-NEXT: v_alignbit_b32 v6, v7, v6, 16 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v45 +; SI-NEXT: v_lshr_b64 v[6:7], v[6:7], 16 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v46 ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; SI-NEXT: v_alignbit_b32 v7, v8, v7, 16 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v42 +; SI-NEXT: v_lshr_b64 v[7:8], v[7:8], 16 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v44 ; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; SI-NEXT: v_alignbit_b32 v8, v9, v8, 16 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v52 +; SI-NEXT: v_lshr_b64 v[8:9], v[8:9], 16 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v54 ; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16 -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v41 +; SI-NEXT: v_lshr_b64 v[9:10], v[9:10], 16 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v51 ; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; SI-NEXT: v_alignbit_b32 v10, v11, v10, 16 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v55 +; SI-NEXT: v_lshr_b64 v[10:11], v[10:11], 16 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v50 ; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; SI-NEXT: v_alignbit_b32 v11, v12, v11, 16 -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v54 +; SI-NEXT: v_lshr_b64 v[11:12], v[11:12], 16 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v55 ; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; SI-NEXT: v_alignbit_b32 v12, v13, v12, 16 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v56 +; SI-NEXT: v_lshr_b64 v[12:13], v[12:13], 16 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v48 ; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; SI-NEXT: v_alignbit_b32 v13, v14, v13, 16 -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v53 +; SI-NEXT: v_lshr_b64 v[13:14], v[13:14], 16 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v38 ; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: v_alignbit_b32 v14, v15, v14, 16 -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v51 +; SI-NEXT: v_lshr_b64 v[14:15], v[14:15], 16 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v33 ; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; SI-NEXT: v_alignbit_b32 v15, v16, v15, 16 -; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v43 +; SI-NEXT: v_lshr_b64 v[15:16], v[15:16], 16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; SI-NEXT: 
v_add_f32_e32 v30, 0x40c00000, v30 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; SI-NEXT: v_alignbit_b32 v16, v17, v16, 16 -; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v50 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 ; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; SI-NEXT: v_alignbit_b32 v17, v18, v17, 16 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_lshr_b64 v[16:17], v[16:17], 16 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v34 +; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; SI-NEXT: v_lshr_b64 v[17:18], v[17:18], 16 +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v37 ; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; SI-NEXT: v_alignbit_b32 v18, v19, v18, 16 -; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v49 +; SI-NEXT: v_lshr_b64 v[18:19], v[18:19], 16 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v36 ; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 -; SI-NEXT: v_alignbit_b32 v19, v20, v19, 16 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshr_b64 v[19:20], v[19:20], 16 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; SI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; SI-NEXT: v_lshr_b64 v[32:33], v[32:33], 16 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 ; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 -; SI-NEXT: v_alignbit_b32 v20, v21, v20, 16 -; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v48 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_lshr_b64 v[20:21], v[20:21], 16 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 ; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 -; SI-NEXT: v_alignbit_b32 v21, v22, v21, 16 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 ; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 -; SI-NEXT: v_alignbit_b32 v22, v23, v22, 16 -; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v39 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_lshr_b64 v[21:22], v[21:22], 16 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v23, 
0xffff0000, v23 +; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_lshr_b64 v[22:23], v[22:23], 16 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 ; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 -; SI-NEXT: v_alignbit_b32 v23, v24, v23, 16 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 ; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 -; SI-NEXT: v_alignbit_b32 v24, v25, v24, 16 -; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v38 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_lshr_b64 v[23:24], v[23:24], 16 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 ; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 -; SI-NEXT: v_alignbit_b32 v25, v26, v25, 16 -; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v36 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_lshr_b64 v[24:25], v[24:25], 16 +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v41 +; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 +; SI-NEXT: v_lshr_b64 v[25:26], v[25:26], 16 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 ; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 -; SI-NEXT: v_alignbit_b32 v26, v27, v26, 16 -; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v34 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 ; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 -; SI-NEXT: v_alignbit_b32 v27, v28, v27, 16 -; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v35 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_lshr_b64 v[26:27], v[26:27], 16 +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v43 +; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 +; SI-NEXT: v_lshr_b64 v[27:28], v[27:28], 16 +; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v42 ; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 -; SI-NEXT: v_alignbit_b32 v28, v29, v28, 16 -; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v37 +; SI-NEXT: v_lshr_b64 v[28:29], v[28:29], 16 +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v52 ; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 -; SI-NEXT: v_alignbit_b32 v29, v30, v29, 16 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshr_b64 v[29:30], v[29:30], 16 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: 
v_and_b32_e32 v30, 0xffff0000, v30 ; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 -; SI-NEXT: v_alignbit_b32 v30, v31, v30, 16 -; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v33 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 ; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 -; SI-NEXT: v_alignbit_b32 v31, v32, v31, 16 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_lshr_b64 v[30:31], v[30:31], 16 +; SI-NEXT: v_mov_b32_e32 v31, v32 ; SI-NEXT: .LBB43_5: ; %end ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload @@ -65969,36 +66351,39 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg ; VI-NEXT: v_mov_b32_e32 v31, v17 ; VI-NEXT: v_mov_b32_e32 v30, v16 ; VI-NEXT: v_mov_b32_e32 v29, v15 -; VI-NEXT: v_mov_b32_e32 v28, v14 ; VI-NEXT: v_mov_b32_e32 v27, v13 -; VI-NEXT: v_mov_b32_e32 v26, v12 ; VI-NEXT: v_mov_b32_e32 v25, v11 -; VI-NEXT: v_mov_b32_e32 v24, v10 ; VI-NEXT: v_mov_b32_e32 v23, v9 -; VI-NEXT: v_mov_b32_e32 v22, v8 ; VI-NEXT: v_mov_b32_e32 v21, v7 -; VI-NEXT: v_mov_b32_e32 v20, v6 ; VI-NEXT: v_mov_b32_e32 v19, v5 -; VI-NEXT: v_mov_b32_e32 v32, v4 ; VI-NEXT: v_mov_b32_e32 v17, v3 -; VI-NEXT: v_mov_b32_e32 v16, v2 ; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v28, v14 +; VI-NEXT: v_mov_b32_e32 v26, v12 +; VI-NEXT: v_mov_b32_e32 v24, v10 +; VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v32, v4 +; VI-NEXT: v_mov_b32_e32 v16, v2 ; VI-NEXT: v_mov_b32_e32 v14, v0 -; VI-NEXT: v_mov_b32_e32 v0, s16 -; VI-NEXT: v_mov_b32_e32 v1, s17 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec -; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v1, s17 ; VI-NEXT: v_mov_b32_e32 v3, s19 -; VI-NEXT: v_mov_b32_e32 v4, s20 ; VI-NEXT: v_mov_b32_e32 v5, s21 -; VI-NEXT: v_mov_b32_e32 v6, s22 ; VI-NEXT: v_mov_b32_e32 v7, s23 -; VI-NEXT: v_mov_b32_e32 v8, s24 ; VI-NEXT: v_mov_b32_e32 v9, s25 -; VI-NEXT: v_mov_b32_e32 v10, s26 ; VI-NEXT: v_mov_b32_e32 v11, s27 -; VI-NEXT: v_mov_b32_e32 v12, s28 ; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 ; 4-byte Folded Spill ; VI-NEXT: s_cbranch_scc0 .LBB43_4 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_cbranch_execnz .LBB43_3 @@ -66007,580 +66392,600 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg ; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v15, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v15 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_bfe_u32 v18, v15, 16, 1 +; VI-NEXT: 
v_cndmask_b32_e32 v33, v33, v34, vcc +; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v15 +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x7fff, v18 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v15 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 -; VI-NEXT: v_cndmask_b32_e32 v15, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; VI-NEXT: v_alignbit_b32 v15, v15, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v14 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v15, v18, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v15 +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v14 +; VI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; VI-NEXT: v_bfe_u32 v18, v15, 16, 1 +; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v15 ; VI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x7fff, v18 ; VI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v14, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v14 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v14 +; VI-NEXT: v_lshrrev_b64 v[34:35], 16, v[33:34] +; VI-NEXT: v_or_b32_e32 v33, 0x400000, v15 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; VI-NEXT: v_bfe_u32 v15, v14, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v35, v18, v33, vcc +; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v14 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v14 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 -; VI-NEXT: v_cndmask_b32_e32 v14, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; VI-NEXT: v_alignbit_b32 v14, v14, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v13 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v14, v15, v18, vcc +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v14 +; VI-NEXT: v_lshrrev_b64 v[14:15], 16, v[35:36] +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v13 +; VI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; VI-NEXT: v_bfe_u32 v18, v15, 16, 1 +; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v15 ; VI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x7fff, v18 ; VI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v13, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v13 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v13 +; VI-NEXT: v_or_b32_e32 v33, 0x400000, v15 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; VI-NEXT: v_bfe_u32 v15, v13, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v35, v18, v33, vcc +; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v13 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v13 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 -; VI-NEXT: v_cndmask_b32_e32 v13, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; VI-NEXT: v_alignbit_b32 v13, v13, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v12 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v13, 
v15, v18, vcc +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v13 +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v12 +; VI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; VI-NEXT: v_bfe_u32 v15, v13, 16, 1 +; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v13 ; VI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 ; VI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v12, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v12 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v12 +; VI-NEXT: v_lshrrev_b64 v[35:36], 16, v[35:36] +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v13 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; VI-NEXT: v_bfe_u32 v13, v12, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v36, v15, v18, vcc +; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v12 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v12 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 -; VI-NEXT: v_cndmask_b32_e32 v12, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; VI-NEXT: v_alignbit_b32 v12, v12, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v11 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v12, v13, v15, vcc +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v12 +; VI-NEXT: v_lshrrev_b64 v[12:13], 16, v[36:37] +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v11 +; VI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; VI-NEXT: v_bfe_u32 v15, v13, 16, 1 +; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v13 ; VI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 ; VI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v11, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v11 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v11 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v13 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; VI-NEXT: v_bfe_u32 v13, v11, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v36, v15, v18, vcc +; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v11 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v11 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 -; VI-NEXT: v_cndmask_b32_e32 v11, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; VI-NEXT: v_alignbit_b32 v11, v11, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v10 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v11, v13, v15, vcc +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v11 +; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v10 +; VI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; VI-NEXT: v_bfe_u32 v13, v11, 16, 1 +; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v11 ; VI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 ; VI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v10, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v10 -; VI-NEXT: 
v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v10 +; VI-NEXT: v_lshrrev_b64 v[36:37], 16, v[36:37] +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v11 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; VI-NEXT: v_bfe_u32 v11, v10, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v37, v13, v15, vcc +; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v10 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 +; VI-NEXT: v_or_b32_e32 v13, 0x400000, v10 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 -; VI-NEXT: v_cndmask_b32_e32 v10, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; VI-NEXT: v_alignbit_b32 v10, v10, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v9 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v10, v11, v13, vcc +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v10 +; VI-NEXT: v_lshrrev_b64 v[10:11], 16, v[37:38] +; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v9 +; VI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; VI-NEXT: v_bfe_u32 v13, v11, 16, 1 +; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v11 ; VI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 ; VI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v9, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v9 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v9 +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v11 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; VI-NEXT: v_bfe_u32 v11, v9, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v37, v13, v15, vcc +; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v9 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 +; VI-NEXT: v_or_b32_e32 v13, 0x400000, v9 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 -; VI-NEXT: v_cndmask_b32_e32 v9, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; VI-NEXT: v_alignbit_b32 v9, v9, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v8 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v9, v11, v13, vcc +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v9 +; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v8 +; VI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; VI-NEXT: v_bfe_u32 v11, v9, 16, 1 +; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v9 ; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 ; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v8, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v8 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v8 +; VI-NEXT: v_lshrrev_b64 v[37:38], 16, v[37:38] +; VI-NEXT: v_or_b32_e32 v13, 0x400000, v9 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; VI-NEXT: v_bfe_u32 v9, v8, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v38, v11, v13, vcc +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; VI-NEXT: v_or_b32_e32 v11, 0x400000, v8 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 -; VI-NEXT: v_cndmask_b32_e32 v8, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; VI-NEXT: v_alignbit_b32 v8, v8, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v7 -; 
VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v8, v9, v11, vcc +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v8 +; VI-NEXT: v_lshrrev_b64 v[8:9], 16, v[38:39] +; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v7 +; VI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; VI-NEXT: v_bfe_u32 v11, v9, 16, 1 +; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v9 ; VI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 ; VI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v7, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v7 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v7 +; VI-NEXT: v_or_b32_e32 v13, 0x400000, v9 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; VI-NEXT: v_bfe_u32 v9, v7, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v38, v11, v13, vcc +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v7 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; VI-NEXT: v_or_b32_e32 v11, 0x400000, v7 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 -; VI-NEXT: v_cndmask_b32_e32 v7, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; VI-NEXT: v_alignbit_b32 v7, v7, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v6 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v7, v9, v11, vcc +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v7 +; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v6 +; VI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; VI-NEXT: v_bfe_u32 v9, v7, 16, 1 +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v7 ; VI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 ; VI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v6, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v6 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v6 +; VI-NEXT: v_or_b32_e32 v11, 0x400000, v7 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; VI-NEXT: v_bfe_u32 v7, v6, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v48, v9, v11, vcc +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v6 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v6 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; VI-NEXT: v_cndmask_b32_e32 v6, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; VI-NEXT: v_alignbit_b32 v6, v6, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v5 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v6, v7, v9, vcc +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v6 +; VI-NEXT: v_lshrrev_b64 v[6:7], 16, v[48:49] +; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v5 +; VI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; VI-NEXT: v_bfe_u32 v9, v7, 16, 1 +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v7 ; VI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 ; VI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, 
v34, vcc -; VI-NEXT: v_bfe_u32 v33, v5, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v5 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v5 +; VI-NEXT: v_or_b32_e32 v11, 0x400000, v7 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; VI-NEXT: v_bfe_u32 v7, v5, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v48, v9, v11, vcc +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v5 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; VI-NEXT: v_cndmask_b32_e32 v5, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; VI-NEXT: v_alignbit_b32 v5, v5, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v4 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v5 +; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; VI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; VI-NEXT: v_bfe_u32 v7, v5, 16, 1 +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v5 ; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 ; VI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v4, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v4 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v4 +; VI-NEXT: v_lshrrev_b64 v[48:49], 16, v[48:49] +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v49, v7, v9, vcc +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v4 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; VI-NEXT: v_cndmask_b32_e32 v4, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; VI-NEXT: v_alignbit_b32 v4, v4, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v3 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v4, v5, v7, vcc +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v4 +; VI-NEXT: v_lshrrev_b64 v[4:5], 16, v[49:50] +; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v3 +; VI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; VI-NEXT: v_bfe_u32 v7, v5, 16, 1 +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v5 ; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 ; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v3 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v3 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v49, v7, v9, vcc +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_cndmask_b32_e32 v3, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_alignbit_b32 v3, v3, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v2 -; 
VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v3 +; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 ; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 ; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v2 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v2 +; VI-NEXT: v_lshrrev_b64 v[49:50], 16, v[49:50] +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v50, v5, v7, vcc +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: v_alignbit_b32 v2, v2, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v1 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v2 +; VI-NEXT: v_lshrrev_b64 v[2:3], 16, v[50:51] +; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 ; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 ; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v1 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v1 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v50, v5, v7, vcc +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: v_cndmask_b32_e32 v1, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-NEXT: v_alignbit_b32 v1, v1, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v0 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v0 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 ; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; 
VI-NEXT: v_bfe_u32 v33, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v0 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v0 +; VI-NEXT: v_lshrrev_b64 v[50:51], 16, v[50:51] +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v51, v3, v5, vcc +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v0, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v31 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v31, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v31 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v31 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 -; VI-NEXT: v_cndmask_b32_e32 v31, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 -; VI-NEXT: v_alignbit_b32 v31, v31, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v30 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v30, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v30 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v30 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 -; VI-NEXT: v_cndmask_b32_e32 v30, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 -; VI-NEXT: v_alignbit_b32 v30, v30, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v29 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v29, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v29 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v29 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 -; VI-NEXT: v_cndmask_b32_e32 v29, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 -; VI-NEXT: v_alignbit_b32 v29, v29, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v28 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; 
VI-NEXT: v_bfe_u32 v33, v28, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v28 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v28 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 -; VI-NEXT: v_cndmask_b32_e32 v28, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 -; VI-NEXT: v_alignbit_b32 v28, v28, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v27 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v27, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v27 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v27 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 -; VI-NEXT: v_cndmask_b32_e32 v27, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 -; VI-NEXT: v_alignbit_b32 v27, v27, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v26 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v26, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v26 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v26 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 -; VI-NEXT: v_cndmask_b32_e32 v26, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 -; VI-NEXT: v_alignbit_b32 v26, v26, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v25 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v25, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v25 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v25 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 -; VI-NEXT: v_cndmask_b32_e32 v25, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 -; VI-NEXT: v_alignbit_b32 v25, v25, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v24 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v24, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v24 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v24 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 -; VI-NEXT: v_cndmask_b32_e32 v24, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 -; VI-NEXT: v_alignbit_b32 v24, 
v24, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v23 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v23, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v23 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v23 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 -; VI-NEXT: v_cndmask_b32_e32 v23, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 -; VI-NEXT: v_alignbit_b32 v23, v23, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v22 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v22, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v22 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v22 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 -; VI-NEXT: v_cndmask_b32_e32 v22, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 -; VI-NEXT: v_alignbit_b32 v22, v22, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v21 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v21, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v21 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v21 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 -; VI-NEXT: v_cndmask_b32_e32 v21, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 -; VI-NEXT: v_alignbit_b32 v21, v21, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v20 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v20, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v20 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v20 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 -; VI-NEXT: v_cndmask_b32_e32 v20, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 -; VI-NEXT: v_alignbit_b32 v20, v20, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v19 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: 
v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v19, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v19 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v19 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 -; VI-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 -; VI-NEXT: v_alignbit_b32 v19, v19, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v32 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; VI-NEXT: v_alignbit_b32 v32, v32, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v17 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v17, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v17 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v17 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 -; VI-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; VI-NEXT: v_alignbit_b32 v17, v17, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v16 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v16, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v16 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v16 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 -; VI-NEXT: v_cndmask_b32_e32 v16, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; VI-NEXT: v_alignbit_b32 v16, v16, v18, 16 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v0 +; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[51:52] +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v31 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v31 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v51, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: 
v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v30 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v30 +; VI-NEXT: v_lshrrev_b64 v[51:52], 16, v[51:52] +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v52, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v29 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v29 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_lshrrev_b64 v[30:31], 16, v[52:53] +; VI-NEXT: v_cndmask_b32_e32 v52, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v28 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v28 +; VI-NEXT: v_lshrrev_b64 v[52:53], 16, v[52:53] +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v53, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v27 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v27 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_lshrrev_b64 v[28:29], 16, v[53:54] +; VI-NEXT: v_cndmask_b32_e32 v53, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v26 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v26 +; VI-NEXT: v_lshrrev_b64 v[53:54], 16, v[53:54] +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v54, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v25 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v25 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_lshrrev_b64 v[26:27], 16, v[54:55] +; VI-NEXT: v_cndmask_b32_e32 v54, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v24 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v24 +; VI-NEXT: v_lshrrev_b64 v[38:39], 16, v[38:39] +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v39, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v23 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v23 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_lshrrev_b64 v[24:25], 16, v[39:40] +; VI-NEXT: v_cndmask_b32_e32 v39, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v22 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v22 +; VI-NEXT: v_lshrrev_b64 v[39:40], 16, v[39:40] +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v40, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: 
v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v21 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v21 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_lshrrev_b64 v[22:23], 16, v[40:41] +; VI-NEXT: v_cndmask_b32_e32 v40, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v20 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v20 +; VI-NEXT: v_lshrrev_b64 v[40:41], 16, v[40:41] +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v41, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v19 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v19 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v18, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v32 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v32 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v31, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v17 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v17 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_lshrrev_b64 v[20:21], 16, v[41:42] +; VI-NEXT: v_cndmask_b32_e32 v41, v3, v5, vcc 
+; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v16 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v16 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v15, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b64 v[54:55], 16, v[54:55] +; VI-NEXT: v_lshrrev_b64 v[18:19], 16, v[18:19] +; VI-NEXT: v_lshrrev_b64 v[41:42], 16, v[41:42] +; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v1 +; VI-NEXT: v_lshrrev_b64 v[32:33], 16, v[31:32] +; VI-NEXT: v_lshrrev_b64 v[16:17], 16, v[15:16] +; VI-NEXT: v_mov_b32_e32 v1, v50 +; VI-NEXT: v_mov_b32_e32 v3, v49 +; VI-NEXT: v_mov_b32_e32 v5, v48 +; VI-NEXT: v_mov_b32_e32 v7, v38 +; VI-NEXT: v_mov_b32_e32 v9, v37 +; VI-NEXT: v_mov_b32_e32 v11, v36 +; VI-NEXT: v_mov_b32_e32 v13, v35 +; VI-NEXT: v_mov_b32_e32 v15, v34 +; VI-NEXT: v_mov_b32_e32 v17, v41 +; VI-NEXT: v_mov_b32_e32 v19, v18 +; VI-NEXT: v_mov_b32_e32 v21, v40 +; VI-NEXT: v_mov_b32_e32 v23, v39 +; VI-NEXT: v_mov_b32_e32 v25, v54 +; VI-NEXT: v_mov_b32_e32 v27, v53 +; VI-NEXT: v_mov_b32_e32 v29, v52 +; VI-NEXT: v_mov_b32_e32 v31, v51 ; VI-NEXT: .LBB43_3: ; %end +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v18, v32 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB43_4: ; VI-NEXT: s_branch .LBB43_2 @@ -97627,229 +98032,229 @@ define inreg <64 x bfloat> @bitcast_v16i64_to_v64bf16_scalar(<16 x i64> inreg %a ; SI-NEXT: v_writelane_b32 v21, s8, 3 ; SI-NEXT: .LBB61_3: ; %end ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s69 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s68 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s68 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: v_readlane_b32 s4, v21, 2 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s67 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s66 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s66 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s65 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s64 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s64 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 ; 
SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s55 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s54 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s54 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s53 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s52 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s52 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s51 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s50 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s50 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s49 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s48 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s48 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s39 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s38 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s38 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s37 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s36 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s36 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s35 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s34 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s34 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s31 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s30 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s30 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s95 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s94 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; 
SI-NEXT: v_mul_f32_e64 v1, 1.0, s94 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s93 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s92 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s92 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s91 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s90 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s90 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s89 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s88 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s88 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s79 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s78 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s78 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s77 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s76 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s76 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s75 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s74 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s74 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s73 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s72 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s72 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s63 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s62 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s62 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s61 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s60 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s60 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s59 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s58 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s58 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s57 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s56 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s56 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s47 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s46 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s46 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s45 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s44 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s44 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s43 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s42 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s42 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s41 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s40 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s40 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s15 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s14 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s14 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s13 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s12 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s12 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 ; SI-NEXT: 
buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s11 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s10 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s10 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 -; SI-NEXT: v_readlane_b32 s4, v21, 2 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s4 ; SI-NEXT: v_readlane_b32 s4, v21, 3 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s4 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s4 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 ; SI-NEXT: v_readlane_b32 s4, v21, 0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s4 ; SI-NEXT: v_readlane_b32 s4, v21, 1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s4 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s4 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: v_readlane_b32 s99, v20, 35 @@ -101207,562 +101612,737 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v52, v30 -; SI-NEXT: v_mov_b32_e32 v53, v28 -; SI-NEXT: v_mov_b32_e32 v40, v12 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:40 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v51, off, 
s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:68 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:72 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:68 -; SI-NEXT: v_mov_b32_e32 v55, v14 -; SI-NEXT: v_mul_f32_e32 v14, 1.0, v0 -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v6 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v63, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v5 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v8 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v7 +; SI-NEXT: v_mov_b32_e32 v43, v21 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v10 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v11 +; SI-NEXT: v_mov_b32_e32 v54, v29 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v55 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v43 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v16 -; SI-NEXT: v_mul_f32_e32 v58, 1.0, v1 -; SI-NEXT: v_mul_f32_e32 v56, 1.0, v3 -; SI-NEXT: v_mul_f32_e32 v54, 1.0, v2 -; SI-NEXT: v_mul_f32_e32 v44, 1.0, v5 -; SI-NEXT: v_mul_f32_e32 v46, 1.0, v4 -; SI-NEXT: v_mul_f32_e32 v61, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v54 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v41, v23 +; SI-NEXT: v_mov_b32_e32 v29, v20 +; SI-NEXT: v_mul_f32_e32 v57, 1.0, v3 ; SI-NEXT: v_mul_f32_e32 v59, 1.0, v9 -; SI-NEXT: v_mul_f32_e32 v57, 1.0, v11 -; SI-NEXT: v_mul_f32_e32 v13, 1.0, v13 -; SI-NEXT: v_mul_f32_e32 v47, 1.0, v40 -; SI-NEXT: v_mul_f32_e32 v45, 1.0, v15 -; SI-NEXT: v_mul_f32_e32 v15, 1.0, v17 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: v_mul_f32_e32 v16, 1.0, v19 -; SI-NEXT: v_mul_f32_e32 v43, 1.0, v18 -; SI-NEXT: v_mul_f32_e32 v17, 1.0, v21 -; SI-NEXT: v_mul_f32_e32 v41, 1.0, v20 -; SI-NEXT: v_mul_f32_e32 v18, 1.0, v23 -; SI-NEXT: v_mul_f32_e32 v40, 1.0, v22 -; SI-NEXT: v_mul_f32_e32 v19, 1.0, v25 -; SI-NEXT: v_mul_f32_e32 v55, 1.0, v24 -; SI-NEXT: v_mul_f32_e32 v20, 1.0, v27 -; SI-NEXT: v_mul_f32_e32 v21, 1.0, v29 -; SI-NEXT: v_mul_f32_e32 v53, 1.0, v53 -; SI-NEXT: v_mul_f32_e32 v52, 1.0, v52 +; SI-NEXT: v_mul_f32_e32 v61, 
1.0, v13 +; SI-NEXT: v_mul_f32_e32 v23, 1.0, v15 +; SI-NEXT: v_mul_f32_e32 v44, 1.0, v17 +; SI-NEXT: v_mul_f32_e32 v21, 1.0, v19 +; SI-NEXT: v_mul_f32_e32 v20, 1.0, v41 +; SI-NEXT: v_mul_f32_e32 v17, 1.0, v25 +; SI-NEXT: v_mul_f32_e32 v15, 1.0, v27 +; SI-NEXT: v_mul_f32_e64 v25, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v3, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v5, 1.0, s25 +; SI-NEXT: v_mul_f32_e64 v7, 1.0, s29 +; SI-NEXT: v_mul_f32_e32 v9, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v54, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v11, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v56, 1.0, v8 +; SI-NEXT: v_mul_f32_e32 v13, 1.0, v10 +; SI-NEXT: v_mul_f32_e32 v58, 1.0, v12 +; SI-NEXT: v_mul_f32_e32 v60, 1.0, v14 +; SI-NEXT: v_mul_f32_e32 v62, 1.0, v16 +; SI-NEXT: v_mul_f32_e32 v47, 1.0, v22 +; SI-NEXT: v_mul_f32_e32 v22, 1.0, v28 +; SI-NEXT: v_mul_f32_e64 v19, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v14, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v16, 1.0, s22 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v32 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v33 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v34 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v35 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v36 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v37 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v38 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v39 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v48 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v49 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v0, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v3, 1.0, s16 ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s19 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v4, 1.0, s21 -; SI-NEXT: v_mul_f32_e64 v8, 1.0, s20 -; SI-NEXT: v_mul_f32_e64 v10, 1.0, s23 -; SI-NEXT: v_mul_f32_e64 v9, 1.0, s22 -; SI-NEXT: v_mul_f32_e64 v5, 1.0, s27 -; SI-NEXT: v_mul_f32_e64 v11, 1.0, s26 -; SI-NEXT: v_mul_f32_e64 v6, 1.0, s29 -; SI-NEXT: v_mul_f32_e64 v7, 1.0, s28 +; SI-NEXT: v_mul_f32_e32 v39, 1.0, v0 ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v48 -; SI-NEXT: v_mul_f32_e32 v48, 1.0, v26 -; SI-NEXT: v_mul_f32_e32 v22, 1.0, v51 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v45 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mul_f32_e32 v23, 1.0, v37 -; SI-NEXT: 
v_mul_f32_e32 v51, 1.0, v50 -; SI-NEXT: v_mul_f32_e32 v24, 1.0, v38 -; SI-NEXT: v_mul_f32_e32 v50, 1.0, v49 -; SI-NEXT: v_mul_f32_e32 v25, 1.0, v39 -; SI-NEXT: v_mul_f32_e32 v49, 1.0, v30 -; SI-NEXT: v_mul_f32_e32 v26, 1.0, v28 -; SI-NEXT: v_mul_f32_e32 v39, 1.0, v12 -; SI-NEXT: v_mul_f32_e32 v27, 1.0, v31 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_mul_f32_e32 v38, 1.0, v60 -; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_mul_f32_e64 v35, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v33, 1.0, s27 +; SI-NEXT: v_mul_f32_e32 v32, 1.0, v18 +; SI-NEXT: v_mul_f32_e32 v34, 1.0, v29 +; SI-NEXT: v_mul_f32_e32 v36, 1.0, v24 +; SI-NEXT: v_mul_f32_e32 v38, 1.0, v26 +; SI-NEXT: v_mul_f32_e32 v31, 1.0, v30 +; SI-NEXT: v_mul_f32_e32 v24, 1.0, v51 +; SI-NEXT: v_mul_f32_e32 v41, 1.0, v53 +; SI-NEXT: v_mul_f32_e32 v26, 1.0, v55 +; SI-NEXT: v_mul_f32_e32 v43, 1.0, v40 ; SI-NEXT: v_mul_f32_e32 v28, 1.0, v42 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_mul_f32_e32 v37, 1.0, v62 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_mul_f32_e32 v29, 1.0, v63 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_mul_f32_e32 v30, 1.0, v33 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_mul_f32_e32 v31, 1.0, v34 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_mul_f32_e32 v33, 1.0, v35 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_mul_f32_e32 v42, 1.0, v36 -; SI-NEXT: v_mul_f32_e64 v12, 1.0, s25 -; SI-NEXT: v_mul_f32_e64 v34, 1.0, s24 -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: 
buffer_store_dword v54, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v51, 1.0, v50 +; SI-NEXT: v_mul_f32_e32 v53, 1.0, v52 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v46 +; SI-NEXT: v_mul_f32_e64 v48, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v18, 1.0, s24 +; SI-NEXT: v_mul_f32_e64 v29, 1.0, s26 +; SI-NEXT: v_mul_f32_e64 v45, 1.0, s28 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB63_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_alignbit_b32 v0, v0, v3, 16 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_alignbit_b32 v2, v2, v8, 16 -; SI-NEXT: v_alignbit_b32 v3, v3, v9, 16 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; 
SI-NEXT: v_alignbit_b32 v6, v6, v7, 16 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v58 -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v56 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v44 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_alignbit_b32 v5, v5, v11, 16 -; SI-NEXT: v_alignbit_b32 v7, v7, v14, 16 -; SI-NEXT: v_alignbit_b32 v8, v8, v54, 16 -; SI-NEXT: v_alignbit_b32 v9, v9, v46, 16 -; SI-NEXT: v_mov_b32_e32 v62, v61 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v61 -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v60, v59 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v59 -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v56, v47 -; SI-NEXT: v_alignbit_b32 v13, v13, v47, 16 -; SI-NEXT: v_mov_b32_e32 v46, v45 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v45 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v12 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v57 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 -; SI-NEXT: v_alignbit_b32 v30, v30, v31, 16 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v33 -; SI-NEXT: v_alignbit_b32 v4, v4, v34, 16 -; SI-NEXT: v_mov_b32_e32 v63, v44 -; SI-NEXT: v_mov_b32_e32 v58, v57 -; SI-NEXT: v_mov_b32_e32 v44, v43 -; SI-NEXT: v_alignbit_b32 v16, v16, v43, 16 -; SI-NEXT: v_mov_b32_e32 v43, v41 -; SI-NEXT: v_alignbit_b32 v17, v17, v41, 16 -; SI-NEXT: v_alignbit_b32 v18, v18, v40, 16 -; SI-NEXT: v_mov_b32_e32 v40, v55 -; SI-NEXT: v_alignbit_b32 v19, v19, v55, 16 -; SI-NEXT: v_alignbit_b32 v20, v20, v48, 16 -; SI-NEXT: v_mov_b32_e32 v48, v53 -; SI-NEXT: v_alignbit_b32 v21, v21, v53, 16 -; SI-NEXT: v_alignbit_b32 v22, v22, v52, 16 -; SI-NEXT: v_mov_b32_e32 v52, v51 -; SI-NEXT: v_alignbit_b32 v23, v23, v51, 16 -; SI-NEXT: v_alignbit_b32 v24, v24, v50, 16 -; SI-NEXT: v_mov_b32_e32 v50, v49 -; SI-NEXT: v_alignbit_b32 v25, v25, v49, 16 -; SI-NEXT: v_mov_b32_e32 v36, v39 -; SI-NEXT: v_alignbit_b32 v26, v26, v39, 16 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v34, v38 -; SI-NEXT: v_alignbit_b32 v27, v27, v38, 16 -; SI-NEXT: v_mov_b32_e32 v35, v37 -; SI-NEXT: v_alignbit_b32 v28, v28, v37, 16 -; SI-NEXT: v_mov_b32_e32 v37, v32 -; SI-NEXT: v_alignbit_b32 v29, v29, v32, 16 -; SI-NEXT: v_alignbit_b32 v31, v31, v42, 16 +; SI-NEXT: v_mov_b32_e32 v0, v19 +; SI-NEXT: v_mov_b32_e32 v37, v20 
+; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v63 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v57 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v61 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v44 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v21 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 s[4:5], 0 -; SI-NEXT: v_mov_b32_e32 v32, v33 -; SI-NEXT: v_mov_b32_e32 v33, v42 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_alignbit_b32 v10, v10, v61, 16 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_alignbit_b32 v12, v12, v54, 16 -; SI-NEXT: v_mov_b32_e32 v41, v61 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[0:1], v[19:20], 16 +; SI-NEXT: v_mov_b32_e32 v1, v48 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[1:2], v[48:49], 16 +; SI-NEXT: v_mov_b32_e32 v2, v14 +; SI-NEXT: v_mov_b32_e32 v49, v15 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v3 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[2:3], v[14:15], 16 +; SI-NEXT: v_mov_b32_e32 v3, v16 +; SI-NEXT: v_mov_b32_e32 v20, v17 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v35 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[3:4], v[16:17], 16 +; SI-NEXT: v_mov_b32_e32 v4, v18 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v5 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[4:5], v[18:19], 16 +; SI-NEXT: v_mov_b32_e32 v5, v29 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[5:6], v[29:30], 16 +; SI-NEXT: v_mov_b32_e32 v6, v45 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[6:7], v[45:46], 16 +; SI-NEXT: v_mov_b32_e32 v7, v39 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[7:8], v[39:40], 16 +; SI-NEXT: v_mov_b32_e32 v8, v9 +; SI-NEXT: 
buffer_store_dword v8, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v37 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v49 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[8:9], v[9:10], 16 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v9 +; SI-NEXT: v_mov_b32_e32 v9, v54 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[9:10], v[54:55], 16 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v55, v13 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v10 +; SI-NEXT: v_mov_b32_e32 v10, v11 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[10:11], v[11:12], 16 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v11 +; SI-NEXT: v_mov_b32_e32 v11, v56 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[11:12], v[56:57], 16 +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v56, v44 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v12 +; SI-NEXT: v_lshr_b64 v[12:13], v[13:14], 16 +; SI-NEXT: v_mov_b32_e32 v13, v58 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[13:14], v[58:59], 16 +; SI-NEXT: v_mov_b32_e32 v14, v60 +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[14:15], v[60:61], 16 +; SI-NEXT: v_mov_b32_e32 v15, v62 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[15:16], v[62:63], 16 +; SI-NEXT: v_mov_b32_e32 v16, v32 +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[16:17], v[32:33], 16 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v27 +; SI-NEXT: v_mov_b32_e32 v33, v34 +; SI-NEXT: 
buffer_load_dword v59, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_alignbit_b32 v11, v11, v59, 16 -; SI-NEXT: v_mov_b32_e32 v55, v59 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_alignbit_b32 v14, v14, v45, 16 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v17 +; SI-NEXT: v_mov_b32_e32 v40, v17 +; SI-NEXT: v_lshr_b64 v[17:18], v[34:35], 16 +; SI-NEXT: v_lshr_b64 v[18:19], v[47:48], 16 +; SI-NEXT: v_lshr_b64 v[19:20], v[36:37], 16 +; SI-NEXT: v_mov_b32_e32 v20, v38 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[20:21], v[38:39], 16 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_mov_b32_e32 v34, v47 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v21 +; SI-NEXT: v_mov_b32_e32 v21, v22 +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[21:22], v[22:23], 16 +; SI-NEXT: v_mov_b32_e32 v22, v31 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[22:23], v[31:32], 16 +; SI-NEXT: v_mov_b32_e32 v23, v24 +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[23:24], v[24:25], 16 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v52 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v30 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v24 +; SI-NEXT: v_mov_b32_e32 v24, v41 +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[24:25], v[41:42], 16 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v41, v26 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v50 +; SI-NEXT: v_mov_b32_e32 v42, v51 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v25 +; SI-NEXT: v_lshr_b64 v[25:26], v[26:27], 16 +; SI-NEXT: v_mov_b32_e32 v26, v43 +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[26:27], v[43:44], 16 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v43, v28 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v27 +; SI-NEXT: v_lshr_b64 v[27:28], v[28:29], 16 +; SI-NEXT: v_lshr_b64 v[28:29], v[51:52], 16 +; SI-NEXT: v_lshr_b64 v[29:30], v[53:54], 16 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 
offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v52, v53 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v30 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshr_b64 v[30:31], v[31:32], 16 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_alignbit_b32 v15, v15, v47, 16 -; SI-NEXT: v_mov_b32_e32 v51, v47 -; SI-NEXT: v_mov_b32_e32 v53, v45 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v57 +; SI-NEXT: v_mov_b32_e32 v53, v31 +; SI-NEXT: v_lshr_b64 v[31:32], v[31:32], 16 ; SI-NEXT: s_branch .LBB63_3 ; SI-NEXT: .LBB63_2: -; SI-NEXT: v_mov_b32_e32 v63, v44 -; SI-NEXT: v_mov_b32_e32 v44, v43 -; SI-NEXT: v_mov_b32_e32 v43, v41 -; SI-NEXT: v_mov_b32_e32 v40, v55 -; SI-NEXT: v_mov_b32_e32 v48, v53 -; SI-NEXT: v_mov_b32_e32 v52, v51 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v62, v61 -; SI-NEXT: v_mov_b32_e32 v60, v59 -; SI-NEXT: v_mov_b32_e32 v58, v57 -; SI-NEXT: v_mov_b32_e32 v56, v47 -; SI-NEXT: v_mov_b32_e32 v46, v45 -; SI-NEXT: v_mov_b32_e32 v50, v49 -; SI-NEXT: v_mov_b32_e32 v36, v39 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v34, v38 -; SI-NEXT: v_mov_b32_e32 v35, v37 -; SI-NEXT: v_mov_b32_e32 v37, v32 +; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:260 ; 
4-byte Folded Spill +; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v56, v44 +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v55, v13 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_mov_b32_e32 v33, v34 +; SI-NEXT: v_mov_b32_e32 v34, v47 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v43, v28 +; SI-NEXT: v_mov_b32_e32 v52, v53 +; SI-NEXT: v_mov_b32_e32 v53, v0 ; SI-NEXT: s_mov_b64 s[4:5], -1 -; SI-NEXT: v_mov_b32_e32 v32, v33 -; SI-NEXT: v_mov_b32_e32 v33, v42 +; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v41, v26 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v42, v51 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: .LBB63_3: ; %Flow -; SI-NEXT: v_mov_b32_e32 v38, v50 -; SI-NEXT: v_mov_b32_e32 v39, v52 -; SI-NEXT: v_mov_b32_e32 v49, v40 -; SI-NEXT: v_mov_b32_e32 v50, v43 -; SI-NEXT: v_mov_b32_e32 v43, v44 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], 
s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v37, v34 +; SI-NEXT: v_mov_b32_e32 v34, v33 +; SI-NEXT: v_mov_b32_e32 v35, v56 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_mov_b32_e32 v32, v40 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload ; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_mov_b32_e32 v33, v38 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mov_b32_e32 v51, v46 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mov_b32_e32 v54, v46 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mov_b32_e32 v44, v46 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mov_b32_e32 v45, v56 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mov_b32_e32 v47, v56 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mov_b32_e32 v58, v60 +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; 
SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload ; SI-NEXT: s_cbranch_vccnz .LBB63_5 ; SI-NEXT: ; %bb.4: ; %cmp.true -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v44 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v57 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v61 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v49 ; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 ; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v40 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v39 ; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 ; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v63 -; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v62 -; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v60 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v59 ; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 ; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v58 -; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v46 -; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v40 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v35 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v32 +; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:176 ; 4-byte Folded 
Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v53 ; SI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; SI-NEXT: v_alignbit_b32 v1, v3, v2, 16 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 +; SI-NEXT: v_lshr_b64 v[1:2], v[2:3], 16 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v60 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 ; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 ; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: s_waitcnt 
vmcnt(13) ; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 ; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 -; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 -; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 ; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 ; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 -; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 ; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 ; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 -; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 -; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 -; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 -; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 -; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 ; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 ; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 -; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 ; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 ; SI-NEXT: v_add_f32_e32 
v29, 0x40c00000, v29 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 ; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 -; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 -; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v58 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; SI-NEXT: v_lshr_b64 v[3:4], v[3:4], 16 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v56 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; SI-NEXT: v_lshr_b64 v[4:5], v[4:5], 16 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v47 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; SI-NEXT: v_lshr_b64 v[5:6], v[5:6], 16 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v45 ; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; SI-NEXT: v_alignbit_b32 v6, v7, v6, 16 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v45 +; SI-NEXT: v_lshr_b64 v[6:7], v[6:7], 16 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v46 ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; SI-NEXT: v_alignbit_b32 v7, v8, v7, 16 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v42 +; SI-NEXT: v_lshr_b64 v[7:8], v[7:8], 16 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v44 ; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; SI-NEXT: v_alignbit_b32 v8, v9, v8, 16 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v52 +; SI-NEXT: v_lshr_b64 v[8:9], v[8:9], 16 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v54 ; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16 -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v41 +; SI-NEXT: v_lshr_b64 v[9:10], v[9:10], 16 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v51 ; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; SI-NEXT: v_alignbit_b32 v10, v11, v10, 16 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v55 +; SI-NEXT: v_lshr_b64 v[10:11], v[10:11], 16 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v50 ; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; SI-NEXT: v_alignbit_b32 v11, v12, v11, 16 -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v54 +; SI-NEXT: v_lshr_b64 v[11:12], v[11:12], 16 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v55 ; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; SI-NEXT: v_alignbit_b32 v12, v13, v12, 16 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v56 +; SI-NEXT: v_lshr_b64 v[12:13], v[12:13], 16 +; 
SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v48 ; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; SI-NEXT: v_alignbit_b32 v13, v14, v13, 16 -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v53 +; SI-NEXT: v_lshr_b64 v[13:14], v[13:14], 16 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v38 ; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: v_alignbit_b32 v14, v15, v14, 16 -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v51 +; SI-NEXT: v_lshr_b64 v[14:15], v[14:15], 16 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v33 ; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; SI-NEXT: v_alignbit_b32 v15, v16, v15, 16 -; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v43 +; SI-NEXT: v_lshr_b64 v[15:16], v[15:16], 16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; SI-NEXT: v_alignbit_b32 v16, v17, v16, 16 -; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v50 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 ; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; SI-NEXT: v_alignbit_b32 v17, v18, v17, 16 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_lshr_b64 v[16:17], v[16:17], 16 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v34 +; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; SI-NEXT: v_lshr_b64 v[17:18], v[17:18], 16 +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v37 ; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; SI-NEXT: v_alignbit_b32 v18, v19, v18, 16 -; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v49 +; SI-NEXT: v_lshr_b64 v[18:19], v[18:19], 16 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v36 ; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 -; SI-NEXT: v_alignbit_b32 v19, v20, v19, 16 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshr_b64 v[19:20], v[19:20], 16 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; SI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; SI-NEXT: v_lshr_b64 v[32:33], v[32:33], 16 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 ; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 -; SI-NEXT: v_alignbit_b32 v20, v21, v20, 16 -; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v48 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_lshr_b64 v[20:21], v[20:21], 16 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload 
+; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 ; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 -; SI-NEXT: v_alignbit_b32 v21, v22, v21, 16 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 ; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 -; SI-NEXT: v_alignbit_b32 v22, v23, v22, 16 -; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v39 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_lshr_b64 v[21:22], v[21:22], 16 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_lshr_b64 v[22:23], v[22:23], 16 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 ; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 -; SI-NEXT: v_alignbit_b32 v23, v24, v23, 16 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 ; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 -; SI-NEXT: v_alignbit_b32 v24, v25, v24, 16 -; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v38 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_lshr_b64 v[23:24], v[23:24], 16 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 ; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 -; SI-NEXT: v_alignbit_b32 v25, v26, v25, 16 -; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v36 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_lshr_b64 v[24:25], v[24:25], 16 +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v41 +; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 +; SI-NEXT: v_lshr_b64 v[25:26], v[25:26], 16 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 ; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 -; SI-NEXT: v_alignbit_b32 v26, v27, v26, 16 -; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v34 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 ; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 -; SI-NEXT: v_alignbit_b32 v27, v28, v27, 16 -; SI-NEXT: v_and_b32_e32 
v28, 0xffff0000, v35 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_lshr_b64 v[26:27], v[26:27], 16 +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v43 +; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 +; SI-NEXT: v_lshr_b64 v[27:28], v[27:28], 16 +; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v42 ; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 -; SI-NEXT: v_alignbit_b32 v28, v29, v28, 16 -; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v37 +; SI-NEXT: v_lshr_b64 v[28:29], v[28:29], 16 +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v52 ; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 -; SI-NEXT: v_alignbit_b32 v29, v30, v29, 16 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshr_b64 v[29:30], v[29:30], 16 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 ; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 -; SI-NEXT: v_alignbit_b32 v30, v31, v30, 16 -; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v33 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 ; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 -; SI-NEXT: v_alignbit_b32 v31, v32, v31, 16 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_lshr_b64 v[30:31], v[30:31], 16 +; SI-NEXT: v_mov_b32_e32 v31, v32 ; SI-NEXT: .LBB63_5: ; %end ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload @@ -101790,36 +102370,39 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_mov_b32_e32 v31, v17 ; VI-NEXT: v_mov_b32_e32 v30, v16 ; VI-NEXT: v_mov_b32_e32 v29, v15 -; VI-NEXT: v_mov_b32_e32 v28, v14 ; VI-NEXT: v_mov_b32_e32 v27, v13 -; VI-NEXT: v_mov_b32_e32 v26, v12 ; VI-NEXT: v_mov_b32_e32 v25, v11 -; VI-NEXT: v_mov_b32_e32 v24, v10 ; VI-NEXT: v_mov_b32_e32 v23, v9 -; VI-NEXT: v_mov_b32_e32 v22, v8 ; VI-NEXT: v_mov_b32_e32 v21, v7 -; VI-NEXT: v_mov_b32_e32 v20, v6 ; VI-NEXT: v_mov_b32_e32 v19, v5 -; VI-NEXT: v_mov_b32_e32 v32, v4 ; VI-NEXT: v_mov_b32_e32 v17, v3 -; VI-NEXT: v_mov_b32_e32 v16, v2 ; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v28, v14 +; VI-NEXT: v_mov_b32_e32 v26, v12 +; VI-NEXT: v_mov_b32_e32 v24, v10 +; VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v32, v4 +; VI-NEXT: v_mov_b32_e32 v16, v2 ; VI-NEXT: v_mov_b32_e32 v14, v0 -; VI-NEXT: v_mov_b32_e32 v0, s16 -; VI-NEXT: v_mov_b32_e32 v1, s17 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec -; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v1, s17 ; VI-NEXT: v_mov_b32_e32 v3, s19 -; VI-NEXT: v_mov_b32_e32 v4, s20 ; VI-NEXT: v_mov_b32_e32 v5, s21 -; VI-NEXT: v_mov_b32_e32 v6, s22 ; VI-NEXT: v_mov_b32_e32 v7, s23 -; VI-NEXT: v_mov_b32_e32 v8, s24 ; VI-NEXT: v_mov_b32_e32 v9, s25 -; VI-NEXT: v_mov_b32_e32 v10, s26 ; VI-NEXT: v_mov_b32_e32 v11, s27 -; VI-NEXT: v_mov_b32_e32 v12, s28 ; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:8 ; 
4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 ; 4-byte Folded Spill ; VI-NEXT: s_cbranch_scc0 .LBB63_4 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_cbranch_execnz .LBB63_3 @@ -101828,580 +102411,600 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v15, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v15 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_bfe_u32 v18, v15, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v33, v33, v34, vcc +; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v15 +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x7fff, v18 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v15 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 -; VI-NEXT: v_cndmask_b32_e32 v15, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; VI-NEXT: v_alignbit_b32 v15, v15, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v14 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v15, v18, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v15 +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v14 +; VI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; VI-NEXT: v_bfe_u32 v18, v15, 16, 1 +; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v15 ; VI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x7fff, v18 ; VI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v14, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v14 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v14 +; VI-NEXT: v_lshrrev_b64 v[34:35], 16, v[33:34] +; VI-NEXT: v_or_b32_e32 v33, 0x400000, v15 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; VI-NEXT: v_bfe_u32 v15, v14, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v35, v18, v33, vcc +; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v14 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v14 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 -; VI-NEXT: v_cndmask_b32_e32 v14, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; VI-NEXT: v_alignbit_b32 v14, v14, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v13 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v14, v15, v18, vcc +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v14 +; VI-NEXT: v_lshrrev_b64 v[14:15], 16, v[35:36] +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v13 +; VI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; VI-NEXT: v_bfe_u32 v18, v15, 16, 1 +; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v15 ; VI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, 
v18, v18 +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x7fff, v18 ; VI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v13, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v13 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v13 +; VI-NEXT: v_or_b32_e32 v33, 0x400000, v15 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; VI-NEXT: v_bfe_u32 v15, v13, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v35, v18, v33, vcc +; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v13 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v13 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 -; VI-NEXT: v_cndmask_b32_e32 v13, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; VI-NEXT: v_alignbit_b32 v13, v13, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v12 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v13, v15, v18, vcc +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v13 +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v12 +; VI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; VI-NEXT: v_bfe_u32 v15, v13, 16, 1 +; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v13 ; VI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 ; VI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v12, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v12 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v12 +; VI-NEXT: v_lshrrev_b64 v[35:36], 16, v[35:36] +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v13 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; VI-NEXT: v_bfe_u32 v13, v12, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v36, v15, v18, vcc +; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v12 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v12 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 -; VI-NEXT: v_cndmask_b32_e32 v12, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; VI-NEXT: v_alignbit_b32 v12, v12, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v11 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v12, v13, v15, vcc +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v12 +; VI-NEXT: v_lshrrev_b64 v[12:13], 16, v[36:37] +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v11 +; VI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; VI-NEXT: v_bfe_u32 v15, v13, 16, 1 +; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v13 ; VI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 ; VI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v11, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v11 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v11 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v13 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; VI-NEXT: v_bfe_u32 v13, v11, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v36, v15, v18, vcc +; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v11 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 +; VI-NEXT: 
v_or_b32_e32 v15, 0x400000, v11 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 -; VI-NEXT: v_cndmask_b32_e32 v11, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; VI-NEXT: v_alignbit_b32 v11, v11, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v10 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v11, v13, v15, vcc +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v11 +; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v10 +; VI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; VI-NEXT: v_bfe_u32 v13, v11, 16, 1 +; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v11 ; VI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 ; VI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v10, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v10 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v10 +; VI-NEXT: v_lshrrev_b64 v[36:37], 16, v[36:37] +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v11 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; VI-NEXT: v_bfe_u32 v11, v10, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v37, v13, v15, vcc +; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v10 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 +; VI-NEXT: v_or_b32_e32 v13, 0x400000, v10 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 -; VI-NEXT: v_cndmask_b32_e32 v10, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; VI-NEXT: v_alignbit_b32 v10, v10, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v9 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v10, v11, v13, vcc +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v10 +; VI-NEXT: v_lshrrev_b64 v[10:11], 16, v[37:38] +; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v9 +; VI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; VI-NEXT: v_bfe_u32 v13, v11, 16, 1 +; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v11 ; VI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 ; VI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v9, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v9 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v9 +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v11 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; VI-NEXT: v_bfe_u32 v11, v9, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v37, v13, v15, vcc +; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v9 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 +; VI-NEXT: v_or_b32_e32 v13, 0x400000, v9 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 -; VI-NEXT: v_cndmask_b32_e32 v9, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; VI-NEXT: v_alignbit_b32 v9, v9, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v8 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v9, v11, v13, vcc +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v9 +; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v8 +; VI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; VI-NEXT: v_bfe_u32 
v11, v9, 16, 1 +; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v9 ; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 ; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v8, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v8 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v8 +; VI-NEXT: v_lshrrev_b64 v[37:38], 16, v[37:38] +; VI-NEXT: v_or_b32_e32 v13, 0x400000, v9 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; VI-NEXT: v_bfe_u32 v9, v8, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v38, v11, v13, vcc +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; VI-NEXT: v_or_b32_e32 v11, 0x400000, v8 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 -; VI-NEXT: v_cndmask_b32_e32 v8, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; VI-NEXT: v_alignbit_b32 v8, v8, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v7 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v8, v9, v11, vcc +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v8 +; VI-NEXT: v_lshrrev_b64 v[8:9], 16, v[38:39] +; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v7 +; VI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; VI-NEXT: v_bfe_u32 v11, v9, 16, 1 +; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v9 ; VI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 ; VI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v7, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v7 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v7 +; VI-NEXT: v_or_b32_e32 v13, 0x400000, v9 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; VI-NEXT: v_bfe_u32 v9, v7, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v38, v11, v13, vcc +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v7 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; VI-NEXT: v_or_b32_e32 v11, 0x400000, v7 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 -; VI-NEXT: v_cndmask_b32_e32 v7, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; VI-NEXT: v_alignbit_b32 v7, v7, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v6 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v7, v9, v11, vcc +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v7 +; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v6 +; VI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; VI-NEXT: v_bfe_u32 v9, v7, 16, 1 +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v7 ; VI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 ; VI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v6, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v6 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v6 +; VI-NEXT: v_or_b32_e32 v11, 0x400000, v7 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; VI-NEXT: v_bfe_u32 v7, v6, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v48, v9, v11, vcc +; VI-NEXT: 
v_add_u32_e32 v7, vcc, v7, v6 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v6 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; VI-NEXT: v_cndmask_b32_e32 v6, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; VI-NEXT: v_alignbit_b32 v6, v6, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v5 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v6, v7, v9, vcc +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v6 +; VI-NEXT: v_lshrrev_b64 v[6:7], 16, v[48:49] +; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v5 +; VI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; VI-NEXT: v_bfe_u32 v9, v7, 16, 1 +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v7 ; VI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 ; VI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v5, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v5 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v5 +; VI-NEXT: v_or_b32_e32 v11, 0x400000, v7 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; VI-NEXT: v_bfe_u32 v7, v5, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v48, v9, v11, vcc +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v5 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; VI-NEXT: v_cndmask_b32_e32 v5, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; VI-NEXT: v_alignbit_b32 v5, v5, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v4 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v5 +; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; VI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; VI-NEXT: v_bfe_u32 v7, v5, 16, 1 +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v5 ; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 ; VI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v4, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v4 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v4 +; VI-NEXT: v_lshrrev_b64 v[48:49], 16, v[48:49] +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v49, v7, v9, vcc +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v4 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; VI-NEXT: v_cndmask_b32_e32 v4, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; VI-NEXT: v_alignbit_b32 v4, v4, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v3 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v4, v5, v7, vcc +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v4 +; VI-NEXT: v_lshrrev_b64 v[4:5], 16, v[49:50] +; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v3 +; VI-NEXT: 
v_add_f32_e32 v5, 0x40c00000, v5 +; VI-NEXT: v_bfe_u32 v7, v5, 16, 1 +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v5 ; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 ; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v3 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v3 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v49, v7, v9, vcc +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_cndmask_b32_e32 v3, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_alignbit_b32 v3, v3, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v2 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v3 +; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 ; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 ; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v2 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v2 +; VI-NEXT: v_lshrrev_b64 v[49:50], 16, v[49:50] +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v50, v5, v7, vcc +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: v_alignbit_b32 v2, v2, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v1 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v2 +; VI-NEXT: v_lshrrev_b64 v[2:3], 16, v[50:51] +; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 ; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 ; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v1 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v1 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 
v50, v5, v7, vcc +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: v_cndmask_b32_e32 v1, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-NEXT: v_alignbit_b32 v1, v1, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v0 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v0 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 ; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v0 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v0 +; VI-NEXT: v_lshrrev_b64 v[50:51], 16, v[50:51] +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v51, v3, v5, vcc +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v0, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v31 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v31, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v31 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v31 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 -; VI-NEXT: v_cndmask_b32_e32 v31, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 -; VI-NEXT: v_alignbit_b32 v31, v31, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v30 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v30, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v30 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v30 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 -; VI-NEXT: v_cndmask_b32_e32 v30, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 -; VI-NEXT: v_alignbit_b32 v30, v30, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v29 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 
v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v29, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v29 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v29 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 -; VI-NEXT: v_cndmask_b32_e32 v29, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 -; VI-NEXT: v_alignbit_b32 v29, v29, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v28 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v28, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v28 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v28 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 -; VI-NEXT: v_cndmask_b32_e32 v28, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 -; VI-NEXT: v_alignbit_b32 v28, v28, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v27 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v27, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v27 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v27 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 -; VI-NEXT: v_cndmask_b32_e32 v27, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 -; VI-NEXT: v_alignbit_b32 v27, v27, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v26 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v26, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v26 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v26 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 -; VI-NEXT: v_cndmask_b32_e32 v26, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 -; VI-NEXT: v_alignbit_b32 v26, v26, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v25 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v25, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v25 -; 
VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v25 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 -; VI-NEXT: v_cndmask_b32_e32 v25, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 -; VI-NEXT: v_alignbit_b32 v25, v25, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v24 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v24, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v24 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v24 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 -; VI-NEXT: v_cndmask_b32_e32 v24, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 -; VI-NEXT: v_alignbit_b32 v24, v24, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v23 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v23, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v23 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v23 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 -; VI-NEXT: v_cndmask_b32_e32 v23, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 -; VI-NEXT: v_alignbit_b32 v23, v23, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v22 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v22, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v22 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v22 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 -; VI-NEXT: v_cndmask_b32_e32 v22, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 -; VI-NEXT: v_alignbit_b32 v22, v22, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v21 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v21, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v21 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v21 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 -; VI-NEXT: v_cndmask_b32_e32 v21, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 -; VI-NEXT: v_alignbit_b32 v21, v21, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v20 -; VI-NEXT: v_add_f32_e32 
v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v20, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v20 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v20 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 -; VI-NEXT: v_cndmask_b32_e32 v20, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 -; VI-NEXT: v_alignbit_b32 v20, v20, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v19 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v19, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v19 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v19 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 -; VI-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 -; VI-NEXT: v_alignbit_b32 v19, v19, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v32 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; VI-NEXT: v_alignbit_b32 v32, v32, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v17 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v17, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v17 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v17 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 -; VI-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; VI-NEXT: v_alignbit_b32 v17, v17, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v16 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; 
VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v16, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v16 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v16 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 -; VI-NEXT: v_cndmask_b32_e32 v16, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; VI-NEXT: v_alignbit_b32 v16, v16, v18, 16 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v0 +; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[51:52] +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v31 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v31 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v51, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v30 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v30 +; VI-NEXT: v_lshrrev_b64 v[51:52], 16, v[51:52] +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v52, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v29 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v29 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_lshrrev_b64 v[30:31], 16, v[52:53] +; VI-NEXT: v_cndmask_b32_e32 v52, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v28 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v28 +; VI-NEXT: v_lshrrev_b64 v[52:53], 16, v[52:53] +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v53, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: 
v_lshrrev_b32_e32 v54, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v27 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v27 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_lshrrev_b64 v[28:29], 16, v[53:54] +; VI-NEXT: v_cndmask_b32_e32 v53, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v26 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v26 +; VI-NEXT: v_lshrrev_b64 v[53:54], 16, v[53:54] +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v54, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v25 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v25 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_lshrrev_b64 v[26:27], 16, v[54:55] +; VI-NEXT: v_cndmask_b32_e32 v54, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v24 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v24 +; VI-NEXT: v_lshrrev_b64 v[38:39], 16, v[38:39] +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v39, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v23 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v23 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_lshrrev_b64 v[24:25], 16, 
v[39:40] +; VI-NEXT: v_cndmask_b32_e32 v39, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v22 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v22 +; VI-NEXT: v_lshrrev_b64 v[39:40], 16, v[39:40] +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v40, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v21 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v21 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_lshrrev_b64 v[22:23], 16, v[40:41] +; VI-NEXT: v_cndmask_b32_e32 v40, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v20 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v20 +; VI-NEXT: v_lshrrev_b64 v[40:41], 16, v[40:41] +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v41, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v19 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v19 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v18, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v32 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 
v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v32 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v31, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v17 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v17 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_lshrrev_b64 v[20:21], 16, v[41:42] +; VI-NEXT: v_cndmask_b32_e32 v41, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v16 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v16 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v15, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b64 v[54:55], 16, v[54:55] +; VI-NEXT: v_lshrrev_b64 v[18:19], 16, v[18:19] +; VI-NEXT: v_lshrrev_b64 v[41:42], 16, v[41:42] +; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v1 +; VI-NEXT: v_lshrrev_b64 v[32:33], 16, v[31:32] +; VI-NEXT: v_lshrrev_b64 v[16:17], 16, v[15:16] +; VI-NEXT: v_mov_b32_e32 v1, v50 +; VI-NEXT: v_mov_b32_e32 v3, v49 +; VI-NEXT: v_mov_b32_e32 v5, v48 +; VI-NEXT: v_mov_b32_e32 v7, v38 +; VI-NEXT: v_mov_b32_e32 v9, v37 +; VI-NEXT: v_mov_b32_e32 v11, v36 +; VI-NEXT: v_mov_b32_e32 v13, v35 +; VI-NEXT: v_mov_b32_e32 v15, v34 +; VI-NEXT: v_mov_b32_e32 v17, v41 +; VI-NEXT: v_mov_b32_e32 v19, v18 +; VI-NEXT: v_mov_b32_e32 v21, v40 +; VI-NEXT: v_mov_b32_e32 v23, v39 +; VI-NEXT: v_mov_b32_e32 v25, v54 +; VI-NEXT: v_mov_b32_e32 v27, v53 +; VI-NEXT: v_mov_b32_e32 v29, v52 +; VI-NEXT: v_mov_b32_e32 v31, v51 ; VI-NEXT: .LBB63_3: ; %end +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v18, v32 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB63_4: ; VI-NEXT: s_branch .LBB63_2 @@ -133460,94 +134063,92 @@ define inreg <64 x bfloat> @bitcast_v16f64_to_v64bf16_scalar(<16 x double> inreg ; SI-NEXT: v_writelane_b32 v62, s46, 3 ; SI-NEXT: s_cbranch_execnz .LBB77_4 ; SI-NEXT: .LBB77_2: ; %cmp.true -; SI-NEXT: v_add_f64 v[35:36], s[44:45], 1.0 ; 
SI-NEXT: v_add_f64 v[3:4], s[6:7], 1.0 -; SI-NEXT: v_add_f64 v[49:50], s[28:29], 1.0 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v3 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v3 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v35 ; SI-NEXT: v_add_f64 v[1:2], s[22:23], 1.0 ; SI-NEXT: v_add_f64 v[41:42], s[24:25], 1.0 -; SI-NEXT: v_add_f64 v[27:28], s[40:41], 1.0 -; SI-NEXT: v_add_f64 v[15:16], s[10:11], 1.0 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v35 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v50 -; SI-NEXT: v_add_f64 v[31:32], s[42:43], 1.0 -; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v16 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v16 -; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v15 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v15 -; SI-NEXT: v_and_b32_e32 v43, 0xffff0000, v28 -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v28 -; SI-NEXT: v_and_b32_e32 v45, 0xffff0000, v27 -; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v27 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v42 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v42 -; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v41 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v41 +; SI-NEXT: v_and_b32_e32 v54, 0xffff0000, v42 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v42 +; SI-NEXT: v_and_b32_e32 v40, 0xffff0000, v41 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v41 ; SI-NEXT: v_and_b32_e32 v42, 0xffff0000, v2 ; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v2 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_f64 v[2:3], s[20:21], 1.0 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; SI-NEXT: v_add_f64 v[11:12], s[8:9], 1.0 -; SI-NEXT: v_add_f64 v[7:8], s[4:5], 1.0 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; SI-NEXT: v_and_b32_e32 v47, 0xffff0000, v32 -; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v32 -; SI-NEXT: v_and_b32_e32 v57, 0xffff0000, v31 -; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v31 -; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v3 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v3 -; SI-NEXT: v_add_f64 v[3:4], s[16:17], 1.0 -; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v7 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v7 -; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v12 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v12 -; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v11 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v11 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v1 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v1 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4 -; SI-NEXT: v_mov_b32_e32 v4, v5 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: v_add_f64 v[51:52], s[26:27], 1.0 +; SI-NEXT: v_add_f64 v[49:50], s[28:29], 1.0 +; SI-NEXT: v_add_f64 v[35:36], s[44:45], 1.0 +; SI-NEXT: v_add_f64 v[31:32], s[42:43], 1.0 +; SI-NEXT: v_add_f64 v[27:28], s[40:41], 1.0 ; SI-NEXT: v_add_f64 v[23:24], s[14:15], 1.0 ; SI-NEXT: v_add_f64 v[19:20], s[12:13], 1.0 +; SI-NEXT: v_add_f64 v[15:16], s[10:11], 1.0 +; SI-NEXT: v_add_f64 v[11:12], s[8:9], 1.0 +; 
SI-NEXT: v_add_f64 v[7:8], s[4:5], 1.0 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v46, 0xffff0000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v3 ; SI-NEXT: v_add_f64 v[59:60], s[18:19], 1.0 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v8 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v8 -; SI-NEXT: v_and_b32_e32 v37, 0xffff0000, v20 -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v20 -; SI-NEXT: v_and_b32_e32 v39, 0xffff0000, v19 -; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v19 -; SI-NEXT: v_and_b32_e32 v53, 0xffff0000, v24 -; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v24 -; SI-NEXT: v_and_b32_e32 v55, 0xffff0000, v23 -; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v23 -; SI-NEXT: v_and_b32_e32 v61, 0xffff0000, v36 -; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v36 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v50 -; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v49 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v49 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_f64 v[3:4], s[16:17], 1.0 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v8 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v12 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v16 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v20 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v20 +; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v19 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v24 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v24 +; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v23 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v28 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v28 +; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v27 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v32 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v32 +; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v31 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_and_b32_e32 v34, 0xffff0000, v36 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v36 +; SI-NEXT: v_and_b32_e32 v36, 0xffff0000, v35 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 +; SI-NEXT: v_and_b32_e32 v38, 0xffff0000, v50 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v50 +; SI-NEXT: v_and_b32_e32 v48, 0xffff0000, v49 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v49 ; SI-NEXT: v_and_b32_e32 v50, 0xffff0000, v52 ; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v52 ; SI-NEXT: v_and_b32_e32 v52, 0xffff0000, v51 ; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v51 -; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v2 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v2 -; SI-NEXT: v_and_b32_e32 v36, 0xffff0000, v60 -; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v60 +; SI-NEXT: v_and_b32_e32 v43, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v1 +; SI-NEXT: v_and_b32_e32 v47, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v2 +; SI-NEXT: v_and_b32_e32 v58, 0xffff0000, v60 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v60 ; SI-NEXT: v_and_b32_e32 v60, 0xffff0000, v59 ; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v59 +; SI-NEXT: v_and_b32_e32 v61, 0xffff0000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4 ; SI-NEXT: v_and_b32_e32 
v2, 0xffff0000, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: s_branch .LBB77_5 @@ -133622,15 +134223,18 @@ define inreg <64 x bfloat> @bitcast_v16f64_to_v64bf16_scalar(<16 x double> inreg ; SI-NEXT: ; kill: killed $sgpr46 ; SI-NEXT: s_branch .LBB77_2 ; SI-NEXT: .LBB77_4: -; SI-NEXT: v_mov_b32_e32 v1, s37 +; SI-NEXT: v_mov_b32_e32 v1, s59 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v1, s36 -; SI-NEXT: v_readlane_b32 s4, v62, 0 +; SI-NEXT: v_mov_b32_e32 v1, s58 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v1, s34 -; SI-NEXT: v_mov_b32_e32 v7, s4 +; SI-NEXT: v_mov_b32_e32 v1, s57 +; SI-NEXT: v_readlane_b32 s4, v62, 0 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v1, s56 +; SI-NEXT: v_mov_b32_e32 v61, s4 ; SI-NEXT: v_readlane_b32 s4, v62, 1 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -133638,328 +134242,329 @@ define inreg <64 x bfloat> @bitcast_v16f64_to_v64bf16_scalar(<16 x double> inreg ; SI-NEXT: v_readlane_b32 s4, v62, 2 ; SI-NEXT: v_mov_b32_e32 v2, s4 ; SI-NEXT: v_readlane_b32 s4, v62, 3 -; SI-NEXT: v_mov_b32_e32 v5, s59 -; SI-NEXT: v_mov_b32_e32 v4, s58 -; SI-NEXT: v_mov_b32_e32 v9, s57 -; SI-NEXT: v_mov_b32_e32 v6, s56 -; SI-NEXT: v_mov_b32_e32 v13, s99 -; SI-NEXT: v_mov_b32_e32 v10, s98 -; SI-NEXT: v_mov_b32_e32 v17, s97 -; SI-NEXT: v_mov_b32_e32 v14, s96 -; SI-NEXT: v_mov_b32_e32 v21, s87 -; SI-NEXT: v_mov_b32_e32 v18, s86 -; SI-NEXT: v_mov_b32_e32 v25, s85 -; SI-NEXT: v_mov_b32_e32 v22, s84 -; SI-NEXT: v_mov_b32_e32 v29, s83 -; SI-NEXT: v_mov_b32_e32 v26, s82 -; SI-NEXT: v_mov_b32_e32 v33, s81 -; SI-NEXT: v_mov_b32_e32 v30, s80 -; SI-NEXT: v_mov_b32_e32 v37, s71 -; SI-NEXT: v_mov_b32_e32 v34, s70 -; SI-NEXT: v_mov_b32_e32 v39, s69 -; SI-NEXT: v_mov_b32_e32 v38, s68 -; SI-NEXT: v_mov_b32_e32 v53, s67 -; SI-NEXT: v_mov_b32_e32 v48, s66 -; SI-NEXT: v_mov_b32_e32 v55, s65 -; SI-NEXT: v_mov_b32_e32 v54, s64 -; SI-NEXT: v_mov_b32_e32 v43, s55 -; SI-NEXT: v_mov_b32_e32 v40, s54 -; SI-NEXT: v_mov_b32_e32 v45, s53 -; SI-NEXT: v_mov_b32_e32 v44, s52 -; SI-NEXT: v_mov_b32_e32 v47, s51 -; SI-NEXT: v_mov_b32_e32 v46, s50 -; SI-NEXT: v_mov_b32_e32 v57, s49 -; SI-NEXT: v_mov_b32_e32 v56, s48 -; SI-NEXT: v_mov_b32_e32 v61, s39 -; SI-NEXT: v_mov_b32_e32 v58, s38 -; SI-NEXT: v_mov_b32_e32 v8, s35 -; SI-NEXT: v_mov_b32_e32 v24, s31 -; SI-NEXT: v_mov_b32_e32 v23, s30 +; SI-NEXT: v_mov_b32_e32 v6, s99 +; SI-NEXT: v_mov_b32_e32 v5, s98 +; SI-NEXT: v_mov_b32_e32 v8, s97 +; SI-NEXT: v_mov_b32_e32 v7, s96 +; SI-NEXT: v_mov_b32_e32 v10, s87 +; SI-NEXT: v_mov_b32_e32 v9, s86 +; SI-NEXT: v_mov_b32_e32 v12, s85 +; SI-NEXT: v_mov_b32_e32 v11, s84 +; SI-NEXT: v_mov_b32_e32 v14, s83 +; SI-NEXT: v_mov_b32_e32 v13, s82 +; SI-NEXT: v_mov_b32_e32 v16, s81 +; SI-NEXT: v_mov_b32_e32 v15, s80 +; SI-NEXT: v_mov_b32_e32 v18, s71 +; SI-NEXT: v_mov_b32_e32 v17, s70 +; SI-NEXT: v_mov_b32_e32 v20, s69 +; SI-NEXT: v_mov_b32_e32 v19, s68 +; SI-NEXT: v_mov_b32_e32 v22, s67 +; SI-NEXT: v_mov_b32_e32 v21, s66 +; SI-NEXT: v_mov_b32_e32 v24, s65 +; SI-NEXT: v_mov_b32_e32 v23, s64 +; SI-NEXT: v_mov_b32_e32 v26, s55 +; SI-NEXT: v_mov_b32_e32 v25, s54 +; SI-NEXT: v_mov_b32_e32 v28, s53 +; SI-NEXT: v_mov_b32_e32 v27, s52 +; SI-NEXT: v_mov_b32_e32 v30, s51 +; SI-NEXT: 
v_mov_b32_e32 v29, s50 +; SI-NEXT: v_mov_b32_e32 v32, s49 +; SI-NEXT: v_mov_b32_e32 v31, s48 +; SI-NEXT: v_mov_b32_e32 v34, s39 +; SI-NEXT: v_mov_b32_e32 v33, s38 +; SI-NEXT: v_mov_b32_e32 v36, s37 +; SI-NEXT: v_mov_b32_e32 v35, s36 +; SI-NEXT: v_mov_b32_e32 v38, s35 +; SI-NEXT: v_mov_b32_e32 v37, s34 +; SI-NEXT: v_mov_b32_e32 v48, s31 +; SI-NEXT: v_mov_b32_e32 v39, s30 ; SI-NEXT: v_mov_b32_e32 v50, s95 ; SI-NEXT: v_mov_b32_e32 v49, s94 ; SI-NEXT: v_mov_b32_e32 v52, s93 ; SI-NEXT: v_mov_b32_e32 v51, s92 -; SI-NEXT: v_mov_b32_e32 v16, s91 -; SI-NEXT: v_mov_b32_e32 v15, s90 -; SI-NEXT: v_mov_b32_e32 v28, s89 -; SI-NEXT: v_mov_b32_e32 v27, s88 +; SI-NEXT: v_mov_b32_e32 v54, s91 +; SI-NEXT: v_mov_b32_e32 v53, s90 +; SI-NEXT: v_mov_b32_e32 v40, s89 +; SI-NEXT: v_mov_b32_e32 v55, s88 ; SI-NEXT: v_mov_b32_e32 v42, s79 ; SI-NEXT: v_mov_b32_e32 v41, s78 -; SI-NEXT: v_mov_b32_e32 v11, s77 -; SI-NEXT: v_mov_b32_e32 v12, s76 -; SI-NEXT: v_mov_b32_e32 v32, s75 -; SI-NEXT: v_mov_b32_e32 v31, s74 -; SI-NEXT: v_mov_b32_e32 v19, s73 -; SI-NEXT: v_mov_b32_e32 v20, s72 -; SI-NEXT: v_mov_b32_e32 v36, s63 -; SI-NEXT: v_mov_b32_e32 v35, s62 +; SI-NEXT: v_mov_b32_e32 v43, s77 +; SI-NEXT: v_mov_b32_e32 v44, s76 +; SI-NEXT: v_mov_b32_e32 v46, s75 +; SI-NEXT: v_mov_b32_e32 v45, s74 +; SI-NEXT: v_mov_b32_e32 v47, s73 +; SI-NEXT: v_mov_b32_e32 v56, s72 +; SI-NEXT: v_mov_b32_e32 v58, s63 +; SI-NEXT: v_mov_b32_e32 v57, s62 ; SI-NEXT: v_mov_b32_e32 v60, s61 ; SI-NEXT: v_mov_b32_e32 v59, s60 ; SI-NEXT: v_mov_b32_e32 v3, s4 ; SI-NEXT: .LBB77_5: ; %end ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v2 ; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; SI-NEXT: v_alignbit_b32 v2, v2, v3, 16 +; SI-NEXT: v_lshr_b64 v[2:3], v[3:4], 16 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; SI-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v61 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v60 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v59 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v59 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v36 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v35 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v58 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v57 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v19 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v20 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v47 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v56 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: 
s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v32 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v31 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v46 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v45 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v11 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v12 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v43 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v44 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v42 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v41 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v41 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v28 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v27 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v40 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v55 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v16 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v15 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v54 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v53 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v52 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v51 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v51 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v50 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v49 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v49 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v24 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v23 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v48 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v39 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: 
s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v8 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_readlane_b32 s99, v63, 35 -; SI-NEXT: v_readlane_b32 s98, v63, 34 -; SI-NEXT: v_readlane_b32 s97, v63, 33 -; SI-NEXT: v_readlane_b32 s96, v63, 32 -; SI-NEXT: v_readlane_b32 s87, v63, 31 -; SI-NEXT: v_readlane_b32 s86, v63, 30 -; SI-NEXT: v_readlane_b32 s85, v63, 29 -; SI-NEXT: v_readlane_b32 s84, v63, 28 -; SI-NEXT: v_readlane_b32 s83, v63, 27 -; SI-NEXT: v_readlane_b32 s82, v63, 26 -; SI-NEXT: v_readlane_b32 s81, v63, 25 -; SI-NEXT: v_readlane_b32 s80, v63, 24 -; SI-NEXT: v_readlane_b32 s71, v63, 23 -; SI-NEXT: v_readlane_b32 s70, v63, 22 -; SI-NEXT: v_readlane_b32 s69, v63, 21 -; SI-NEXT: v_readlane_b32 s68, v63, 20 -; SI-NEXT: v_readlane_b32 s67, v63, 19 -; SI-NEXT: v_readlane_b32 s66, v63, 18 -; SI-NEXT: v_readlane_b32 s65, v63, 17 -; SI-NEXT: v_readlane_b32 s64, v63, 16 -; SI-NEXT: v_readlane_b32 s55, v63, 15 -; SI-NEXT: v_readlane_b32 s54, v63, 14 -; SI-NEXT: v_readlane_b32 s53, v63, 13 -; SI-NEXT: v_readlane_b32 s52, v63, 12 -; SI-NEXT: v_readlane_b32 s51, v63, 11 -; SI-NEXT: v_readlane_b32 s50, v63, 10 -; SI-NEXT: v_readlane_b32 s49, v63, 9 -; SI-NEXT: v_readlane_b32 s48, v63, 8 -; SI-NEXT: v_readlane_b32 s39, v63, 7 -; SI-NEXT: v_readlane_b32 s38, v63, 6 -; SI-NEXT: v_readlane_b32 s37, v63, 5 -; SI-NEXT: v_readlane_b32 s36, v63, 4 -; SI-NEXT: v_readlane_b32 s35, v63, 3 -; SI-NEXT: v_readlane_b32 s34, v63, 2 -; SI-NEXT: v_readlane_b32 s31, v63, 1 -; SI-NEXT: v_readlane_b32 s30, v63, 0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v38 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v37 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v36 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v35 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v61 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v58 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v34 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v33 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v57 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v56 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v32 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v31 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v47 -; SI-NEXT: 
v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v46 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v30 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v29 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v45 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v44 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v28 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v27 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v43 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v40 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v26 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v25 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v55 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v54 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v24 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v23 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v53 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v48 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v22 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v21 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v39 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v38 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v20 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v19 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v37 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v34 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v18 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v17 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v33 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v30 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v15 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; 
SI-NEXT: v_mul_f32_e32 v1, 1.0, v29 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v26 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v14 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v13 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v25 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v22 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v12 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v11 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v21 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v18 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v10 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v9 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v17 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v14 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v8 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v7 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v13 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v10 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v6 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v5 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v9 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v6 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: v_readlane_b32 s99, v63, 35 +; SI-NEXT: v_readlane_b32 s98, v63, 34 +; SI-NEXT: v_readlane_b32 s97, v63, 33 +; SI-NEXT: v_readlane_b32 s96, v63, 32 +; SI-NEXT: v_readlane_b32 s87, v63, 31 +; SI-NEXT: v_readlane_b32 s86, v63, 30 +; SI-NEXT: v_readlane_b32 s85, v63, 29 +; SI-NEXT: v_readlane_b32 s84, v63, 28 +; SI-NEXT: v_readlane_b32 s83, v63, 27 +; SI-NEXT: v_readlane_b32 s82, v63, 26 +; SI-NEXT: v_readlane_b32 s81, v63, 25 +; SI-NEXT: v_readlane_b32 s80, v63, 24 +; SI-NEXT: v_readlane_b32 s71, v63, 23 +; SI-NEXT: v_readlane_b32 s70, v63, 22 +; SI-NEXT: v_readlane_b32 s69, v63, 21 +; SI-NEXT: v_readlane_b32 s68, v63, 20 +; SI-NEXT: v_readlane_b32 s67, v63, 19 +; SI-NEXT: v_readlane_b32 s66, v63, 18 +; SI-NEXT: v_readlane_b32 s65, v63, 17 +; SI-NEXT: v_readlane_b32 s64, v63, 16 +; SI-NEXT: v_readlane_b32 s55, v63, 15 +; SI-NEXT: v_readlane_b32 s54, v63, 14 +; SI-NEXT: v_readlane_b32 s53, v63, 13 +; SI-NEXT: v_readlane_b32 s52, v63, 12 +; SI-NEXT: v_readlane_b32 s51, v63, 11 +; SI-NEXT: v_readlane_b32 s50, v63, 10 +; SI-NEXT: v_readlane_b32 
s49, v63, 9 +; SI-NEXT: v_readlane_b32 s48, v63, 8 +; SI-NEXT: v_readlane_b32 s39, v63, 7 +; SI-NEXT: v_readlane_b32 s38, v63, 6 +; SI-NEXT: v_readlane_b32 s37, v63, 5 +; SI-NEXT: v_readlane_b32 s36, v63, 4 +; SI-NEXT: v_readlane_b32 s35, v63, 3 +; SI-NEXT: v_readlane_b32 s34, v63, 2 +; SI-NEXT: v_readlane_b32 s31, v63, 1 +; SI-NEXT: v_readlane_b32 s30, v63, 0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v5 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v4 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -137170,562 +137775,737 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v52, v30 -; SI-NEXT: v_mov_b32_e32 v53, v28 -; SI-NEXT: v_mov_b32_e32 v40, v12 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:40 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:56 +; SI-NEXT: 
buffer_load_dword v48, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:68 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:72 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:68 -; SI-NEXT: v_mov_b32_e32 v55, v14 -; SI-NEXT: v_mul_f32_e32 v14, 1.0, v0 -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v6 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v63, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v5 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v8 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v7 +; SI-NEXT: v_mov_b32_e32 v43, v21 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v10 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v11 +; SI-NEXT: v_mov_b32_e32 v54, v29 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v55 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v43 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v16 -; SI-NEXT: v_mul_f32_e32 v58, 1.0, v1 -; SI-NEXT: v_mul_f32_e32 v56, 1.0, v3 -; SI-NEXT: v_mul_f32_e32 v54, 1.0, v2 -; SI-NEXT: v_mul_f32_e32 v44, 1.0, v5 -; SI-NEXT: v_mul_f32_e32 v46, 1.0, v4 -; SI-NEXT: v_mul_f32_e32 v61, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v54 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v41, v23 +; SI-NEXT: v_mov_b32_e32 v29, v20 +; SI-NEXT: v_mul_f32_e32 v57, 1.0, v3 ; SI-NEXT: v_mul_f32_e32 v59, 1.0, v9 -; SI-NEXT: v_mul_f32_e32 v57, 1.0, v11 -; SI-NEXT: v_mul_f32_e32 v13, 1.0, v13 -; SI-NEXT: v_mul_f32_e32 v47, 1.0, v40 -; SI-NEXT: v_mul_f32_e32 v45, 1.0, v15 -; SI-NEXT: v_mul_f32_e32 v15, 1.0, v17 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: v_mul_f32_e32 v16, 1.0, v19 -; SI-NEXT: v_mul_f32_e32 v43, 1.0, v18 -; SI-NEXT: v_mul_f32_e32 v17, 1.0, v21 -; SI-NEXT: v_mul_f32_e32 v41, 1.0, v20 -; SI-NEXT: v_mul_f32_e32 v18, 1.0, v23 -; SI-NEXT: v_mul_f32_e32 v40, 1.0, v22 -; SI-NEXT: v_mul_f32_e32 v19, 1.0, v25 -; SI-NEXT: v_mul_f32_e32 v55, 1.0, v24 -; SI-NEXT: v_mul_f32_e32 v20, 1.0, 
v27 -; SI-NEXT: v_mul_f32_e32 v21, 1.0, v29 -; SI-NEXT: v_mul_f32_e32 v53, 1.0, v53 -; SI-NEXT: v_mul_f32_e32 v52, 1.0, v52 +; SI-NEXT: v_mul_f32_e32 v61, 1.0, v13 +; SI-NEXT: v_mul_f32_e32 v23, 1.0, v15 +; SI-NEXT: v_mul_f32_e32 v44, 1.0, v17 +; SI-NEXT: v_mul_f32_e32 v21, 1.0, v19 +; SI-NEXT: v_mul_f32_e32 v20, 1.0, v41 +; SI-NEXT: v_mul_f32_e32 v17, 1.0, v25 +; SI-NEXT: v_mul_f32_e32 v15, 1.0, v27 +; SI-NEXT: v_mul_f32_e64 v25, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v3, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v5, 1.0, s25 +; SI-NEXT: v_mul_f32_e64 v7, 1.0, s29 +; SI-NEXT: v_mul_f32_e32 v9, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v54, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v11, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v56, 1.0, v8 +; SI-NEXT: v_mul_f32_e32 v13, 1.0, v10 +; SI-NEXT: v_mul_f32_e32 v58, 1.0, v12 +; SI-NEXT: v_mul_f32_e32 v60, 1.0, v14 +; SI-NEXT: v_mul_f32_e32 v62, 1.0, v16 +; SI-NEXT: v_mul_f32_e32 v47, 1.0, v22 +; SI-NEXT: v_mul_f32_e32 v22, 1.0, v28 +; SI-NEXT: v_mul_f32_e64 v19, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v14, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v16, 1.0, s22 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v32 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v33 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v34 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v35 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v36 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v37 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v38 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v39 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v48 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v49 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v0, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v3, 1.0, s16 ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s19 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v4, 1.0, s21 -; SI-NEXT: v_mul_f32_e64 v8, 1.0, s20 -; SI-NEXT: v_mul_f32_e64 v10, 1.0, s23 -; SI-NEXT: v_mul_f32_e64 v9, 1.0, s22 -; SI-NEXT: v_mul_f32_e64 v5, 1.0, s27 -; SI-NEXT: v_mul_f32_e64 v11, 1.0, s26 -; SI-NEXT: v_mul_f32_e64 v6, 1.0, s29 -; SI-NEXT: v_mul_f32_e64 v7, 1.0, s28 +; SI-NEXT: v_mul_f32_e32 v39, 1.0, v0 ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v48 -; SI-NEXT: v_mul_f32_e32 v48, 1.0, v26 -; SI-NEXT: v_mul_f32_e32 v22, 1.0, v51 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v45 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword 
v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mul_f32_e32 v23, 1.0, v37 -; SI-NEXT: v_mul_f32_e32 v51, 1.0, v50 -; SI-NEXT: v_mul_f32_e32 v24, 1.0, v38 -; SI-NEXT: v_mul_f32_e32 v50, 1.0, v49 -; SI-NEXT: v_mul_f32_e32 v25, 1.0, v39 -; SI-NEXT: v_mul_f32_e32 v49, 1.0, v30 -; SI-NEXT: v_mul_f32_e32 v26, 1.0, v28 -; SI-NEXT: v_mul_f32_e32 v39, 1.0, v12 -; SI-NEXT: v_mul_f32_e32 v27, 1.0, v31 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_mul_f32_e32 v38, 1.0, v60 -; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_mul_f32_e64 v35, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v33, 1.0, s27 +; SI-NEXT: v_mul_f32_e32 v32, 1.0, v18 +; SI-NEXT: v_mul_f32_e32 v34, 1.0, v29 +; SI-NEXT: v_mul_f32_e32 v36, 1.0, v24 +; SI-NEXT: v_mul_f32_e32 v38, 1.0, v26 +; SI-NEXT: v_mul_f32_e32 v31, 1.0, v30 +; SI-NEXT: v_mul_f32_e32 v24, 1.0, v51 +; SI-NEXT: v_mul_f32_e32 v41, 1.0, v53 +; SI-NEXT: v_mul_f32_e32 v26, 1.0, v55 +; SI-NEXT: v_mul_f32_e32 v43, 1.0, v40 ; SI-NEXT: v_mul_f32_e32 v28, 1.0, v42 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_mul_f32_e32 v37, 1.0, v62 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_mul_f32_e32 v29, 1.0, v63 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_mul_f32_e32 v30, 1.0, v33 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_mul_f32_e32 v31, 1.0, v34 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_mul_f32_e32 v33, 1.0, v35 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_mul_f32_e32 v42, 1.0, v36 -; SI-NEXT: v_mul_f32_e64 v12, 1.0, s25 -; SI-NEXT: v_mul_f32_e64 v34, 1.0, s24 -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, 
s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v51, 1.0, v50 +; SI-NEXT: v_mul_f32_e32 v53, 1.0, v52 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v46 +; SI-NEXT: v_mul_f32_e64 v48, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v18, 1.0, s24 +; SI-NEXT: v_mul_f32_e64 v29, 1.0, s26 +; SI-NEXT: v_mul_f32_e64 v45, 1.0, s28 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB79_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_alignbit_b32 v0, v0, v3, 16 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v10 -; SI-NEXT: 
v_lshrrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_alignbit_b32 v2, v2, v8, 16 -; SI-NEXT: v_alignbit_b32 v3, v3, v9, 16 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_alignbit_b32 v6, v6, v7, 16 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v58 -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v56 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v44 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_alignbit_b32 v5, v5, v11, 16 -; SI-NEXT: v_alignbit_b32 v7, v7, v14, 16 -; SI-NEXT: v_alignbit_b32 v8, v8, v54, 16 -; SI-NEXT: v_alignbit_b32 v9, v9, v46, 16 -; SI-NEXT: v_mov_b32_e32 v62, v61 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v61 -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v60, v59 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v59 -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v56, v47 -; SI-NEXT: v_alignbit_b32 v13, v13, v47, 16 -; SI-NEXT: v_mov_b32_e32 v46, v45 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v45 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v12 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v57 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 -; SI-NEXT: v_alignbit_b32 v30, v30, v31, 16 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v33 -; SI-NEXT: v_alignbit_b32 v4, v4, v34, 16 -; SI-NEXT: v_mov_b32_e32 v63, v44 -; SI-NEXT: v_mov_b32_e32 v58, v57 -; SI-NEXT: v_mov_b32_e32 v44, v43 -; SI-NEXT: v_alignbit_b32 v16, v16, v43, 16 -; SI-NEXT: v_mov_b32_e32 v43, v41 -; SI-NEXT: v_alignbit_b32 v17, v17, v41, 16 -; SI-NEXT: v_alignbit_b32 v18, v18, v40, 16 -; SI-NEXT: v_mov_b32_e32 v40, v55 -; SI-NEXT: v_alignbit_b32 v19, v19, v55, 16 -; SI-NEXT: v_alignbit_b32 v20, v20, v48, 16 -; SI-NEXT: v_mov_b32_e32 v48, v53 -; SI-NEXT: v_alignbit_b32 v21, v21, v53, 16 -; SI-NEXT: v_alignbit_b32 v22, v22, v52, 16 -; SI-NEXT: v_mov_b32_e32 v52, v51 -; SI-NEXT: v_alignbit_b32 v23, v23, v51, 16 -; SI-NEXT: v_alignbit_b32 v24, v24, v50, 16 -; SI-NEXT: v_mov_b32_e32 v50, v49 -; SI-NEXT: v_alignbit_b32 v25, v25, v49, 16 -; SI-NEXT: v_mov_b32_e32 v36, v39 -; SI-NEXT: v_alignbit_b32 v26, v26, v39, 16 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v34, v38 -; SI-NEXT: v_alignbit_b32 v27, v27, v38, 16 -; SI-NEXT: v_mov_b32_e32 v35, v37 -; SI-NEXT: v_alignbit_b32 v28, v28, v37, 16 -; SI-NEXT: v_mov_b32_e32 v37, v32 -; 
SI-NEXT: v_alignbit_b32 v29, v29, v32, 16 -; SI-NEXT: v_alignbit_b32 v31, v31, v42, 16 +; SI-NEXT: v_mov_b32_e32 v0, v19 +; SI-NEXT: v_mov_b32_e32 v37, v20 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v63 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v57 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v61 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v44 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v21 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 s[4:5], 0 -; SI-NEXT: v_mov_b32_e32 v32, v33 -; SI-NEXT: v_mov_b32_e32 v33, v42 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_alignbit_b32 v10, v10, v61, 16 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_alignbit_b32 v12, v12, v54, 16 -; SI-NEXT: v_mov_b32_e32 v41, v61 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[0:1], v[19:20], 16 +; SI-NEXT: v_mov_b32_e32 v1, v48 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[1:2], v[48:49], 16 +; SI-NEXT: v_mov_b32_e32 v2, v14 +; SI-NEXT: v_mov_b32_e32 v49, v15 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v3 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[2:3], v[14:15], 16 +; SI-NEXT: v_mov_b32_e32 v3, v16 +; SI-NEXT: v_mov_b32_e32 v20, v17 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v35 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[3:4], v[16:17], 16 +; SI-NEXT: v_mov_b32_e32 v4, v18 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v5 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[4:5], v[18:19], 16 +; SI-NEXT: v_mov_b32_e32 v5, v29 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[5:6], v[29:30], 16 +; SI-NEXT: v_mov_b32_e32 v6, v45 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[6:7], v[45:46], 16 +; SI-NEXT: v_mov_b32_e32 v7, v39 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:280 ; 
4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[7:8], v[39:40], 16 +; SI-NEXT: v_mov_b32_e32 v8, v9 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v37 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v49 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[8:9], v[9:10], 16 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v9 +; SI-NEXT: v_mov_b32_e32 v9, v54 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[9:10], v[54:55], 16 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v55, v13 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v10 +; SI-NEXT: v_mov_b32_e32 v10, v11 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[10:11], v[11:12], 16 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v11 +; SI-NEXT: v_mov_b32_e32 v11, v56 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[11:12], v[56:57], 16 +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v56, v44 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v12 +; SI-NEXT: v_lshr_b64 v[12:13], v[13:14], 16 +; SI-NEXT: v_mov_b32_e32 v13, v58 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[13:14], v[58:59], 16 +; SI-NEXT: v_mov_b32_e32 v14, v60 +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[14:15], v[60:61], 16 +; SI-NEXT: v_mov_b32_e32 v15, v62 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[15:16], v[62:63], 16 +; SI-NEXT: v_mov_b32_e32 v16, v32 +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[16:17], v[32:33], 16 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v52, off, s[0:3], 
s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v27 +; SI-NEXT: v_mov_b32_e32 v33, v34 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_alignbit_b32 v11, v11, v59, 16 -; SI-NEXT: v_mov_b32_e32 v55, v59 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_alignbit_b32 v14, v14, v45, 16 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v17 +; SI-NEXT: v_mov_b32_e32 v40, v17 +; SI-NEXT: v_lshr_b64 v[17:18], v[34:35], 16 +; SI-NEXT: v_lshr_b64 v[18:19], v[47:48], 16 +; SI-NEXT: v_lshr_b64 v[19:20], v[36:37], 16 +; SI-NEXT: v_mov_b32_e32 v20, v38 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[20:21], v[38:39], 16 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_mov_b32_e32 v34, v47 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v21 +; SI-NEXT: v_mov_b32_e32 v21, v22 +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[21:22], v[22:23], 16 +; SI-NEXT: v_mov_b32_e32 v22, v31 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[22:23], v[31:32], 16 +; SI-NEXT: v_mov_b32_e32 v23, v24 +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[23:24], v[24:25], 16 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v52 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v30 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v24 +; SI-NEXT: v_mov_b32_e32 v24, v41 +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[24:25], v[41:42], 16 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v41, v26 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v50 +; SI-NEXT: v_mov_b32_e32 v42, v51 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v25 +; SI-NEXT: v_lshr_b64 v[25:26], v[26:27], 16 +; SI-NEXT: v_mov_b32_e32 v26, v43 +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[26:27], v[43:44], 16 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v43, v28 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v27 +; SI-NEXT: v_lshr_b64 v[27:28], v[28:29], 16 +; SI-NEXT: v_lshr_b64 v[28:29], v[51:52], 16 +; SI-NEXT: v_lshr_b64 v[29:30], v[53:54], 16 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: 
buffer_load_dword v31, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v52, v53 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v30 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshr_b64 v[30:31], v[31:32], 16 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_alignbit_b32 v15, v15, v47, 16 -; SI-NEXT: v_mov_b32_e32 v51, v47 -; SI-NEXT: v_mov_b32_e32 v53, v45 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v57 +; SI-NEXT: v_mov_b32_e32 v53, v31 +; SI-NEXT: v_lshr_b64 v[31:32], v[31:32], 16 ; SI-NEXT: s_branch .LBB79_3 ; SI-NEXT: .LBB79_2: -; SI-NEXT: v_mov_b32_e32 v63, v44 -; SI-NEXT: v_mov_b32_e32 v44, v43 -; SI-NEXT: v_mov_b32_e32 v43, v41 -; SI-NEXT: v_mov_b32_e32 v40, v55 -; SI-NEXT: v_mov_b32_e32 v48, v53 -; SI-NEXT: v_mov_b32_e32 v52, v51 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v62, v61 -; SI-NEXT: v_mov_b32_e32 v60, v59 -; SI-NEXT: v_mov_b32_e32 v58, v57 -; SI-NEXT: v_mov_b32_e32 v56, v47 -; SI-NEXT: v_mov_b32_e32 v46, v45 -; SI-NEXT: v_mov_b32_e32 v50, v49 -; SI-NEXT: v_mov_b32_e32 v36, v39 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v34, v38 -; SI-NEXT: v_mov_b32_e32 v35, v37 -; SI-NEXT: v_mov_b32_e32 v37, v32 +; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: 
buffer_store_dword v10, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v56, v44 +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v55, v13 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_mov_b32_e32 v33, v34 +; SI-NEXT: v_mov_b32_e32 v34, v47 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v43, v28 +; SI-NEXT: v_mov_b32_e32 v52, v53 +; SI-NEXT: v_mov_b32_e32 v53, v0 ; SI-NEXT: s_mov_b64 s[4:5], -1 -; SI-NEXT: v_mov_b32_e32 v32, v33 -; SI-NEXT: v_mov_b32_e32 v33, v42 +; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v41, v26 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v42, v51 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: .LBB79_3: ; %Flow -; SI-NEXT: v_mov_b32_e32 v38, v50 -; SI-NEXT: v_mov_b32_e32 v39, v52 -; SI-NEXT: v_mov_b32_e32 v49, v40 -; SI-NEXT: v_mov_b32_e32 v50, v43 -; SI-NEXT: v_mov_b32_e32 v43, v44 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded 
Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v37, v34 +; SI-NEXT: v_mov_b32_e32 v34, v33 +; SI-NEXT: v_mov_b32_e32 v35, v56 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_mov_b32_e32 v32, v40 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload ; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_mov_b32_e32 v33, v38 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mov_b32_e32 v51, v46 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mov_b32_e32 v54, v46 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mov_b32_e32 v44, v46 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mov_b32_e32 v45, v56 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mov_b32_e32 v47, v56 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mov_b32_e32 v58, v60 +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, 
off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload ; SI-NEXT: s_cbranch_vccnz .LBB79_5 ; SI-NEXT: ; %bb.4: ; %cmp.true -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v44 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v57 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v61 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v49 ; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 ; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v40 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v39 ; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 ; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v63 -; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v62 -; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v60 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v59 ; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 ; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v58 -; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v46 -; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v40 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v35 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v32 +; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: 
buffer_load_dword v26, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v53 ; SI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; SI-NEXT: v_alignbit_b32 v1, v3, v2, 16 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 +; SI-NEXT: v_lshr_b64 v[1:2], v[2:3], 16 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v60 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 ; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 ; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; 
SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 ; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 -; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 -; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 ; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 ; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 -; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 ; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 ; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 -; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 -; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 -; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 -; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 -; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 ; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 ; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 -; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: 
v_and_b32_e32 v26, 0xffff0000, v26 ; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 ; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 ; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 -; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 -; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v58 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; SI-NEXT: v_lshr_b64 v[3:4], v[3:4], 16 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v56 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; SI-NEXT: v_lshr_b64 v[4:5], v[4:5], 16 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v47 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; SI-NEXT: v_lshr_b64 v[5:6], v[5:6], 16 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v45 ; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; SI-NEXT: v_alignbit_b32 v6, v7, v6, 16 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v45 +; SI-NEXT: v_lshr_b64 v[6:7], v[6:7], 16 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v46 ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; SI-NEXT: v_alignbit_b32 v7, v8, v7, 16 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v42 +; SI-NEXT: v_lshr_b64 v[7:8], v[7:8], 16 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v44 ; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; SI-NEXT: v_alignbit_b32 v8, v9, v8, 16 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v52 +; SI-NEXT: v_lshr_b64 v[8:9], v[8:9], 16 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v54 ; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16 -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v41 +; SI-NEXT: v_lshr_b64 v[9:10], v[9:10], 16 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v51 ; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; SI-NEXT: v_alignbit_b32 v10, v11, v10, 16 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v55 +; SI-NEXT: v_lshr_b64 v[10:11], v[10:11], 16 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v50 ; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; SI-NEXT: v_alignbit_b32 v11, v12, v11, 16 -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v54 +; SI-NEXT: v_lshr_b64 v[11:12], v[11:12], 16 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v55 ; SI-NEXT: v_add_f32_e32 v12, 
0x40c00000, v12 -; SI-NEXT: v_alignbit_b32 v12, v13, v12, 16 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v56 +; SI-NEXT: v_lshr_b64 v[12:13], v[12:13], 16 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v48 ; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; SI-NEXT: v_alignbit_b32 v13, v14, v13, 16 -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v53 +; SI-NEXT: v_lshr_b64 v[13:14], v[13:14], 16 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v38 ; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: v_alignbit_b32 v14, v15, v14, 16 -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v51 +; SI-NEXT: v_lshr_b64 v[14:15], v[14:15], 16 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v33 ; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; SI-NEXT: v_alignbit_b32 v15, v16, v15, 16 -; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v43 +; SI-NEXT: v_lshr_b64 v[15:16], v[15:16], 16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; SI-NEXT: v_alignbit_b32 v16, v17, v16, 16 -; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v50 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 ; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; SI-NEXT: v_alignbit_b32 v17, v18, v17, 16 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_lshr_b64 v[16:17], v[16:17], 16 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v34 +; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; SI-NEXT: v_lshr_b64 v[17:18], v[17:18], 16 +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v37 ; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; SI-NEXT: v_alignbit_b32 v18, v19, v18, 16 -; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v49 +; SI-NEXT: v_lshr_b64 v[18:19], v[18:19], 16 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v36 ; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 -; SI-NEXT: v_alignbit_b32 v19, v20, v19, 16 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshr_b64 v[19:20], v[19:20], 16 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; SI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; SI-NEXT: v_lshr_b64 v[32:33], v[32:33], 16 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 ; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 -; SI-NEXT: v_alignbit_b32 v20, v21, v20, 16 -; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v48 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_lshr_b64 v[20:21], v[20:21], 16 +; SI-NEXT: 
buffer_load_dword v21, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 ; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 -; SI-NEXT: v_alignbit_b32 v21, v22, v21, 16 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 ; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 -; SI-NEXT: v_alignbit_b32 v22, v23, v22, 16 -; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v39 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_lshr_b64 v[21:22], v[21:22], 16 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_lshr_b64 v[22:23], v[22:23], 16 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 ; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 -; SI-NEXT: v_alignbit_b32 v23, v24, v23, 16 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 ; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 -; SI-NEXT: v_alignbit_b32 v24, v25, v24, 16 -; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v38 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_lshr_b64 v[23:24], v[23:24], 16 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 ; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 -; SI-NEXT: v_alignbit_b32 v25, v26, v25, 16 -; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v36 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_lshr_b64 v[24:25], v[24:25], 16 +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v41 +; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 +; SI-NEXT: v_lshr_b64 v[25:26], v[25:26], 16 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 ; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 -; SI-NEXT: v_alignbit_b32 v26, v27, v26, 16 -; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v34 +; SI-NEXT: s_waitcnt vmcnt(0) +; 
SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 ; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 -; SI-NEXT: v_alignbit_b32 v27, v28, v27, 16 -; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v35 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_lshr_b64 v[26:27], v[26:27], 16 +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v43 +; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 +; SI-NEXT: v_lshr_b64 v[27:28], v[27:28], 16 +; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v42 ; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 -; SI-NEXT: v_alignbit_b32 v28, v29, v28, 16 -; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v37 +; SI-NEXT: v_lshr_b64 v[28:29], v[28:29], 16 +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v52 ; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 -; SI-NEXT: v_alignbit_b32 v29, v30, v29, 16 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshr_b64 v[29:30], v[29:30], 16 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 ; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 -; SI-NEXT: v_alignbit_b32 v30, v31, v30, 16 -; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v33 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 ; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 -; SI-NEXT: v_alignbit_b32 v31, v32, v31, 16 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_lshr_b64 v[30:31], v[30:31], 16 +; SI-NEXT: v_mov_b32_e32 v31, v32 ; SI-NEXT: .LBB79_5: ; %end ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload @@ -137753,36 +138533,39 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg ; VI-NEXT: v_mov_b32_e32 v31, v17 ; VI-NEXT: v_mov_b32_e32 v30, v16 ; VI-NEXT: v_mov_b32_e32 v29, v15 -; VI-NEXT: v_mov_b32_e32 v28, v14 ; VI-NEXT: v_mov_b32_e32 v27, v13 -; VI-NEXT: v_mov_b32_e32 v26, v12 ; VI-NEXT: v_mov_b32_e32 v25, v11 -; VI-NEXT: v_mov_b32_e32 v24, v10 ; VI-NEXT: v_mov_b32_e32 v23, v9 -; VI-NEXT: v_mov_b32_e32 v22, v8 ; VI-NEXT: v_mov_b32_e32 v21, v7 -; VI-NEXT: v_mov_b32_e32 v20, v6 ; VI-NEXT: v_mov_b32_e32 v19, v5 -; VI-NEXT: v_mov_b32_e32 v32, v4 ; VI-NEXT: v_mov_b32_e32 v17, v3 -; VI-NEXT: v_mov_b32_e32 v16, v2 ; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v28, v14 +; VI-NEXT: v_mov_b32_e32 v26, v12 +; VI-NEXT: v_mov_b32_e32 v24, v10 +; VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v32, v4 +; VI-NEXT: v_mov_b32_e32 v16, v2 ; VI-NEXT: v_mov_b32_e32 v14, v0 -; VI-NEXT: v_mov_b32_e32 v0, s16 -; VI-NEXT: v_mov_b32_e32 v1, s17 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec -; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v1, s17 ; VI-NEXT: v_mov_b32_e32 v3, s19 -; VI-NEXT: v_mov_b32_e32 v4, s20 ; VI-NEXT: v_mov_b32_e32 v5, s21 -; VI-NEXT: v_mov_b32_e32 v6, s22 ; VI-NEXT: v_mov_b32_e32 v7, s23 -; VI-NEXT: v_mov_b32_e32 v8, s24 ; VI-NEXT: v_mov_b32_e32 v9, s25 -; VI-NEXT: v_mov_b32_e32 v10, s26 ; VI-NEXT: v_mov_b32_e32 v11, s27 -; VI-NEXT: v_mov_b32_e32 v12, s28 ; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; 
VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 ; 4-byte Folded Spill ; VI-NEXT: s_cbranch_scc0 .LBB79_4 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_cbranch_execnz .LBB79_3 @@ -137791,580 +138574,600 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg ; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v15, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v15 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_bfe_u32 v18, v15, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v33, v33, v34, vcc +; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v15 +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x7fff, v18 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v15 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 -; VI-NEXT: v_cndmask_b32_e32 v15, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; VI-NEXT: v_alignbit_b32 v15, v15, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v14 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v15, v18, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v15 +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v14 +; VI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; VI-NEXT: v_bfe_u32 v18, v15, 16, 1 +; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v15 ; VI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x7fff, v18 ; VI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v14, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v14 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v14 +; VI-NEXT: v_lshrrev_b64 v[34:35], 16, v[33:34] +; VI-NEXT: v_or_b32_e32 v33, 0x400000, v15 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; VI-NEXT: v_bfe_u32 v15, v14, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v35, v18, v33, vcc +; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v14 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v14 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 -; VI-NEXT: v_cndmask_b32_e32 v14, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; VI-NEXT: v_alignbit_b32 v14, v14, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v13 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v14, v15, v18, vcc +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v14 +; VI-NEXT: v_lshrrev_b64 v[14:15], 16, v[35:36] +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v13 +; VI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; VI-NEXT: v_bfe_u32 v18, v15, 16, 1 +; 
VI-NEXT: v_add_u32_e32 v18, vcc, v18, v15 ; VI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x7fff, v18 ; VI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v13, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v13 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v13 +; VI-NEXT: v_or_b32_e32 v33, 0x400000, v15 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; VI-NEXT: v_bfe_u32 v15, v13, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v35, v18, v33, vcc +; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v13 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v13 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 -; VI-NEXT: v_cndmask_b32_e32 v13, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; VI-NEXT: v_alignbit_b32 v13, v13, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v12 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v13, v15, v18, vcc +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v13 +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v12 +; VI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; VI-NEXT: v_bfe_u32 v15, v13, 16, 1 +; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v13 ; VI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 ; VI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v12, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v12 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v12 +; VI-NEXT: v_lshrrev_b64 v[35:36], 16, v[35:36] +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v13 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; VI-NEXT: v_bfe_u32 v13, v12, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v36, v15, v18, vcc +; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v12 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v12 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 -; VI-NEXT: v_cndmask_b32_e32 v12, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; VI-NEXT: v_alignbit_b32 v12, v12, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v11 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v12, v13, v15, vcc +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v12 +; VI-NEXT: v_lshrrev_b64 v[12:13], 16, v[36:37] +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v11 +; VI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; VI-NEXT: v_bfe_u32 v15, v13, 16, 1 +; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v13 ; VI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 ; VI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v11, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v11 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v11 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v13 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; VI-NEXT: v_bfe_u32 v13, 
v11, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v36, v15, v18, vcc +; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v11 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v11 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 -; VI-NEXT: v_cndmask_b32_e32 v11, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; VI-NEXT: v_alignbit_b32 v11, v11, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v10 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v11, v13, v15, vcc +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v11 +; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v10 +; VI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; VI-NEXT: v_bfe_u32 v13, v11, 16, 1 +; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v11 ; VI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 ; VI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v10, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v10 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v10 +; VI-NEXT: v_lshrrev_b64 v[36:37], 16, v[36:37] +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v11 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; VI-NEXT: v_bfe_u32 v11, v10, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v37, v13, v15, vcc +; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v10 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 +; VI-NEXT: v_or_b32_e32 v13, 0x400000, v10 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 -; VI-NEXT: v_cndmask_b32_e32 v10, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; VI-NEXT: v_alignbit_b32 v10, v10, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v9 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v10, v11, v13, vcc +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v10 +; VI-NEXT: v_lshrrev_b64 v[10:11], 16, v[37:38] +; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v9 +; VI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; VI-NEXT: v_bfe_u32 v13, v11, 16, 1 +; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v11 ; VI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 ; VI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v9, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v9 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v9 +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v11 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; VI-NEXT: v_bfe_u32 v11, v9, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v37, v13, v15, vcc +; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v9 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 +; VI-NEXT: v_or_b32_e32 v13, 0x400000, v9 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 -; VI-NEXT: v_cndmask_b32_e32 v9, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; VI-NEXT: v_alignbit_b32 v9, v9, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v8 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 
v9, v11, v13, vcc +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v9 +; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v8 +; VI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; VI-NEXT: v_bfe_u32 v11, v9, 16, 1 +; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v9 ; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 ; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v8, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v8 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v8 +; VI-NEXT: v_lshrrev_b64 v[37:38], 16, v[37:38] +; VI-NEXT: v_or_b32_e32 v13, 0x400000, v9 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; VI-NEXT: v_bfe_u32 v9, v8, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v38, v11, v13, vcc +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; VI-NEXT: v_or_b32_e32 v11, 0x400000, v8 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 -; VI-NEXT: v_cndmask_b32_e32 v8, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; VI-NEXT: v_alignbit_b32 v8, v8, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v7 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v8, v9, v11, vcc +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v8 +; VI-NEXT: v_lshrrev_b64 v[8:9], 16, v[38:39] +; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v7 +; VI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; VI-NEXT: v_bfe_u32 v11, v9, 16, 1 +; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v9 ; VI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 ; VI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v7, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v7 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v7 +; VI-NEXT: v_or_b32_e32 v13, 0x400000, v9 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; VI-NEXT: v_bfe_u32 v9, v7, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v38, v11, v13, vcc +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v7 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; VI-NEXT: v_or_b32_e32 v11, 0x400000, v7 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 -; VI-NEXT: v_cndmask_b32_e32 v7, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; VI-NEXT: v_alignbit_b32 v7, v7, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v6 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v7, v9, v11, vcc +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v7 +; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v6 +; VI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; VI-NEXT: v_bfe_u32 v9, v7, 16, 1 +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v7 ; VI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 ; VI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v6, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v6 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v6 +; VI-NEXT: 
v_or_b32_e32 v11, 0x400000, v7 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; VI-NEXT: v_bfe_u32 v7, v6, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v48, v9, v11, vcc +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v6 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v6 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; VI-NEXT: v_cndmask_b32_e32 v6, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; VI-NEXT: v_alignbit_b32 v6, v6, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v5 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v6, v7, v9, vcc +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v6 +; VI-NEXT: v_lshrrev_b64 v[6:7], 16, v[48:49] +; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v5 +; VI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; VI-NEXT: v_bfe_u32 v9, v7, 16, 1 +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v7 ; VI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 ; VI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v5, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v5 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v5 +; VI-NEXT: v_or_b32_e32 v11, 0x400000, v7 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; VI-NEXT: v_bfe_u32 v7, v5, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v48, v9, v11, vcc +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v5 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; VI-NEXT: v_cndmask_b32_e32 v5, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; VI-NEXT: v_alignbit_b32 v5, v5, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v4 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v5 +; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; VI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; VI-NEXT: v_bfe_u32 v7, v5, 16, 1 +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v5 ; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 ; VI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v4, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v4 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v4 +; VI-NEXT: v_lshrrev_b64 v[48:49], 16, v[48:49] +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v49, v7, v9, vcc +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v4 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; VI-NEXT: v_cndmask_b32_e32 v4, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; VI-NEXT: v_alignbit_b32 v4, v4, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v3 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: 
v_cndmask_b32_e32 v4, v5, v7, vcc +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v4 +; VI-NEXT: v_lshrrev_b64 v[4:5], 16, v[49:50] +; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v3 +; VI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; VI-NEXT: v_bfe_u32 v7, v5, 16, 1 +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v5 ; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 ; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v3 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v3 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v49, v7, v9, vcc +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_cndmask_b32_e32 v3, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_alignbit_b32 v3, v3, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v2 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v3 +; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 ; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 ; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v2 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v2 +; VI-NEXT: v_lshrrev_b64 v[49:50], 16, v[49:50] +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v50, v5, v7, vcc +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: v_alignbit_b32 v2, v2, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v1 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v2 +; VI-NEXT: v_lshrrev_b64 v[2:3], 16, v[50:51] +; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 ; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 ; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v1 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: 
v_or_b32_e32 v34, 0x400000, v1 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v50, v5, v7, vcc +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: v_cndmask_b32_e32 v1, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-NEXT: v_alignbit_b32 v1, v1, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v0 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v0 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 ; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v0 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v0 +; VI-NEXT: v_lshrrev_b64 v[50:51], 16, v[50:51] +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v51, v3, v5, vcc +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v0, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v31 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v31, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v31 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v31 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 -; VI-NEXT: v_cndmask_b32_e32 v31, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 -; VI-NEXT: v_alignbit_b32 v31, v31, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v30 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v30, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v30 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v30 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 -; VI-NEXT: v_cndmask_b32_e32 v30, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 -; VI-NEXT: v_alignbit_b32 v30, v30, v18, 16 -; VI-NEXT: 
v_lshlrev_b32_e32 v18, 16, v29 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v29, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v29 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v29 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 -; VI-NEXT: v_cndmask_b32_e32 v29, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 -; VI-NEXT: v_alignbit_b32 v29, v29, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v28 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v28, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v28 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v28 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 -; VI-NEXT: v_cndmask_b32_e32 v28, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 -; VI-NEXT: v_alignbit_b32 v28, v28, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v27 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v27, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v27 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v27 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 -; VI-NEXT: v_cndmask_b32_e32 v27, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 -; VI-NEXT: v_alignbit_b32 v27, v27, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v26 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v26, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v26 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v26 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 -; VI-NEXT: v_cndmask_b32_e32 v26, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 -; VI-NEXT: v_alignbit_b32 v26, v26, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v25 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, 
v18 -; VI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v25, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v25 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v25 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 -; VI-NEXT: v_cndmask_b32_e32 v25, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 -; VI-NEXT: v_alignbit_b32 v25, v25, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v24 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v24, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v24 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v24 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 -; VI-NEXT: v_cndmask_b32_e32 v24, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 -; VI-NEXT: v_alignbit_b32 v24, v24, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v23 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v23, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v23 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v23 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 -; VI-NEXT: v_cndmask_b32_e32 v23, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 -; VI-NEXT: v_alignbit_b32 v23, v23, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v22 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v22, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v22 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v22 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 -; VI-NEXT: v_cndmask_b32_e32 v22, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 -; VI-NEXT: v_alignbit_b32 v22, v22, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v21 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v21, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v21 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v21 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 -; VI-NEXT: 
v_cndmask_b32_e32 v21, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 -; VI-NEXT: v_alignbit_b32 v21, v21, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v20 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v20, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v20 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v20 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 -; VI-NEXT: v_cndmask_b32_e32 v20, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 -; VI-NEXT: v_alignbit_b32 v20, v20, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v19 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v19, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v19 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v19 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 -; VI-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 -; VI-NEXT: v_alignbit_b32 v19, v19, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v32 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; VI-NEXT: v_alignbit_b32 v32, v32, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v17 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v17, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v17 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v17 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 -; VI-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; VI-NEXT: v_alignbit_b32 v17, v17, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v16 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 
0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v16, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v16 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v16 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 -; VI-NEXT: v_cndmask_b32_e32 v16, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; VI-NEXT: v_alignbit_b32 v16, v16, v18, 16 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v0 +; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[51:52] +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v31 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v31 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v51, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v30 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v30 +; VI-NEXT: v_lshrrev_b64 v[51:52], 16, v[51:52] +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v52, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v29 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v29 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_lshrrev_b64 v[30:31], 16, v[52:53] +; VI-NEXT: v_cndmask_b32_e32 v52, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v28 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v28 +; VI-NEXT: v_lshrrev_b64 v[52:53], 16, v[52:53] +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v53, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 
v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v27 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v27 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_lshrrev_b64 v[28:29], 16, v[53:54] +; VI-NEXT: v_cndmask_b32_e32 v53, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v26 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v26 +; VI-NEXT: v_lshrrev_b64 v[53:54], 16, v[53:54] +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v54, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v25 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v25 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_lshrrev_b64 v[26:27], 16, v[54:55] +; VI-NEXT: v_cndmask_b32_e32 v54, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v24 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v24 +; VI-NEXT: v_lshrrev_b64 v[38:39], 16, v[38:39] +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v39, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v23 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 
v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v23 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_lshrrev_b64 v[24:25], 16, v[39:40] +; VI-NEXT: v_cndmask_b32_e32 v39, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v22 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v22 +; VI-NEXT: v_lshrrev_b64 v[39:40], 16, v[39:40] +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v40, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v21 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v21 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_lshrrev_b64 v[22:23], 16, v[40:41] +; VI-NEXT: v_cndmask_b32_e32 v40, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v20 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v20 +; VI-NEXT: v_lshrrev_b64 v[40:41], 16, v[40:41] +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v41, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v19 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v19 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v18, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: 
v_lshrrev_b32_e32 v19, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v32 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v32 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v31, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v17 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v17 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_lshrrev_b64 v[20:21], 16, v[41:42] +; VI-NEXT: v_cndmask_b32_e32 v41, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v16 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v16 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v15, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b64 v[54:55], 16, v[54:55] +; VI-NEXT: v_lshrrev_b64 v[18:19], 16, v[18:19] +; VI-NEXT: v_lshrrev_b64 v[41:42], 16, v[41:42] +; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v1 +; VI-NEXT: v_lshrrev_b64 v[32:33], 16, v[31:32] +; VI-NEXT: v_lshrrev_b64 v[16:17], 16, v[15:16] +; VI-NEXT: v_mov_b32_e32 v1, v50 +; VI-NEXT: v_mov_b32_e32 v3, v49 +; VI-NEXT: v_mov_b32_e32 v5, v48 +; VI-NEXT: v_mov_b32_e32 v7, v38 +; VI-NEXT: v_mov_b32_e32 v9, v37 +; VI-NEXT: v_mov_b32_e32 v11, v36 +; VI-NEXT: v_mov_b32_e32 v13, v35 +; VI-NEXT: v_mov_b32_e32 v15, v34 +; VI-NEXT: v_mov_b32_e32 v17, v41 +; VI-NEXT: v_mov_b32_e32 v19, v18 +; VI-NEXT: v_mov_b32_e32 v21, v40 +; VI-NEXT: v_mov_b32_e32 v23, v39 +; VI-NEXT: v_mov_b32_e32 v25, v54 +; VI-NEXT: v_mov_b32_e32 v27, v53 +; VI-NEXT: v_mov_b32_e32 v29, v52 +; VI-NEXT: v_mov_b32_e32 v31, v51 ; VI-NEXT: .LBB79_3: ; %end +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v18, v32 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB79_4: ; VI-NEXT: s_branch .LBB79_2 @@ -153903,6 +154706,7 @@ define inreg <64 x bfloat> 
@bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[4:5] ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:332 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:328 @@ -153912,14 +154716,13 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:312 ; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:308 ; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:304 -; SI-NEXT: ; implicit-def: $vgpr43 : SGPR spill to VGPR lane -; SI-NEXT: s_mov_b32 s72, s21 +; SI-NEXT: ; implicit-def: $vgpr44 : SGPR spill to VGPR lane +; SI-NEXT: s_mov_b32 s73, s21 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v43, s19, 0 -; SI-NEXT: v_writelane_b32 v43, s18, 1 -; SI-NEXT: v_writelane_b32 v43, s17, 2 -; SI-NEXT: v_writelane_b32 v43, s16, 3 -; SI-NEXT: s_mov_b32 s60, s24 +; SI-NEXT: v_writelane_b32 v44, s19, 0 +; SI-NEXT: v_writelane_b32 v44, s18, 1 +; SI-NEXT: v_writelane_b32 v44, s17, 2 +; SI-NEXT: v_writelane_b32 v44, s16, 3 ; SI-NEXT: v_writelane_b32 v41, s30, 0 ; SI-NEXT: v_writelane_b32 v41, s31, 1 ; SI-NEXT: v_writelane_b32 v41, s34, 2 @@ -153944,7 +154747,8 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: v_writelane_b32 v41, s69, 21 ; SI-NEXT: v_writelane_b32 v41, s70, 22 ; SI-NEXT: v_writelane_b32 v41, s71, 23 -; SI-NEXT: s_mov_b32 s77, s28 +; SI-NEXT: s_mov_b32 s74, s29 +; SI-NEXT: s_mov_b32 s78, s28 ; SI-NEXT: s_mov_b32 s76, s27 ; SI-NEXT: v_writelane_b32 v41, s80, 24 ; SI-NEXT: v_writelane_b32 v41, s81, 25 @@ -153955,7 +154759,7 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: v_writelane_b32 v41, s86, 30 ; SI-NEXT: v_writelane_b32 v41, s87, 31 ; SI-NEXT: v_writelane_b32 v41, s96, 32 -; SI-NEXT: s_mov_b32 s79, s26 +; SI-NEXT: s_mov_b32 s47, s26 ; SI-NEXT: v_writelane_b32 v41, s97, 33 ; SI-NEXT: v_writelane_b32 v41, s98, 34 ; SI-NEXT: v_writelane_b32 v41, s99, 35 @@ -153965,32 +154769,32 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:156 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:152 +; SI-NEXT: v_readfirstlane_b32 s37, v22 +; SI-NEXT: ; implicit-def: $vgpr43 : SGPR spill to VGPR lane ; SI-NEXT: v_readfirstlane_b32 s38, v20 -; SI-NEXT: ; implicit-def: $vgpr42 : SGPR spill to VGPR lane +; SI-NEXT: v_writelane_b32 v43, s37, 0 ; SI-NEXT: v_readfirstlane_b32 s39, v19 -; SI-NEXT: v_writelane_b32 v42, s38, 0 +; SI-NEXT: v_writelane_b32 v43, s38, 1 ; SI-NEXT: v_readfirstlane_b32 s48, v25 -; SI-NEXT: v_writelane_b32 v42, s39, 1 +; SI-NEXT: v_writelane_b32 v43, s39, 2 ; SI-NEXT: v_readfirstlane_b32 s49, v26 -; SI-NEXT: v_writelane_b32 v42, s48, 2 +; SI-NEXT: v_writelane_b32 v43, s48, 3 ; SI-NEXT: v_readfirstlane_b32 s50, v24 -; SI-NEXT: v_writelane_b32 v42, s49, 3 +; SI-NEXT: v_writelane_b32 v43, s49, 4 ; SI-NEXT: v_readfirstlane_b32 s51, v23 -; SI-NEXT: v_writelane_b32 v42, s50, 4 +; SI-NEXT: v_writelane_b32 v43, s50, 5 ; SI-NEXT: v_readfirstlane_b32 s52, v29 -; SI-NEXT: v_writelane_b32 v42, 
s51, 5 +; SI-NEXT: v_writelane_b32 v43, s51, 6 ; SI-NEXT: v_readfirstlane_b32 s53, v30 -; SI-NEXT: v_writelane_b32 v42, s52, 6 +; SI-NEXT: v_writelane_b32 v43, s52, 7 ; SI-NEXT: v_readfirstlane_b32 s54, v28 -; SI-NEXT: v_writelane_b32 v42, s53, 7 +; SI-NEXT: v_writelane_b32 v43, s53, 8 ; SI-NEXT: v_readfirstlane_b32 s55, v27 -; SI-NEXT: v_writelane_b32 v42, s54, 8 -; SI-NEXT: v_writelane_b32 v42, s55, 9 +; SI-NEXT: v_writelane_b32 v43, s54, 9 +; SI-NEXT: v_writelane_b32 v43, s55, 10 +; SI-NEXT: s_mov_b32 s57, s24 ; SI-NEXT: v_readfirstlane_b32 s16, v1 ; SI-NEXT: v_readfirstlane_b32 s17, v2 -; SI-NEXT: v_readfirstlane_b32 s18, v5 -; SI-NEXT: v_readfirstlane_b32 s19, v6 -; SI-NEXT: v_readfirstlane_b32 s88, v4 ; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_readfirstlane_b32 s6, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:300 @@ -154001,27 +154805,30 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:280 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_readfirstlane_b32 s4, v32 -; SI-NEXT: v_writelane_b32 v43, s4, 4 +; SI-NEXT: v_writelane_b32 v44, s4, 4 ; SI-NEXT: v_readfirstlane_b32 s4, v33 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:276 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:272 -; SI-NEXT: v_writelane_b32 v43, s4, 5 +; SI-NEXT: v_writelane_b32 v44, s4, 5 ; SI-NEXT: v_readfirstlane_b32 s4, v34 -; SI-NEXT: v_writelane_b32 v43, s4, 6 +; SI-NEXT: v_writelane_b32 v44, s4, 6 ; SI-NEXT: v_readfirstlane_b32 s4, v35 -; SI-NEXT: v_writelane_b32 v43, s4, 7 +; SI-NEXT: v_writelane_b32 v44, s4, 7 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_readfirstlane_b32 s4, v36 -; SI-NEXT: v_writelane_b32 v43, s4, 8 +; SI-NEXT: v_writelane_b32 v44, s4, 8 ; SI-NEXT: v_readfirstlane_b32 s4, v37 ; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:268 ; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:264 ; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:260 ; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:256 -; SI-NEXT: v_writelane_b32 v43, s4, 9 +; SI-NEXT: v_writelane_b32 v44, s4, 9 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_readfirstlane_b32 s4, v38 -; SI-NEXT: v_writelane_b32 v43, s4, 10 +; SI-NEXT: v_writelane_b32 v44, s4, 10 +; SI-NEXT: v_readfirstlane_b32 s18, v5 +; SI-NEXT: v_readfirstlane_b32 s19, v6 +; SI-NEXT: v_readfirstlane_b32 s77, v4 ; SI-NEXT: v_readfirstlane_b32 s89, v3 ; SI-NEXT: v_readfirstlane_b32 s90, v9 ; SI-NEXT: v_readfirstlane_b32 s91, v10 @@ -154034,22 +154841,23 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: v_readfirstlane_b32 s34, v16 ; SI-NEXT: v_readfirstlane_b32 s35, v15 ; SI-NEXT: v_readfirstlane_b32 s36, v21 -; SI-NEXT: v_readfirstlane_b32 s37, v22 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_readfirstlane_b32 s24, v40 ; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_readfirstlane_b32 s4, v31 -; SI-NEXT: v_writelane_b32 v43, s4, 11 +; SI-NEXT: v_writelane_b32 v44, s4, 11 ; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_readfirstlane_b32 s4, v39 -; SI-NEXT: v_writelane_b32 v43, s4, 12 +; SI-NEXT: v_writelane_b32 v44, s4, 12 ; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_readfirstlane_b32 s4, v48 -; SI-NEXT: v_writelane_b32 v43, s4, 13 +; SI-NEXT: v_writelane_b32 v44, s4, 13 ; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_readfirstlane_b32 s4, v49 -; SI-NEXT: v_writelane_b32 v43, s4, 14 +; SI-NEXT: v_writelane_b32 v44, s4, 14 ; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_readfirstlane_b32 s4, v50 
-; SI-NEXT: v_writelane_b32 v43, s4, 15 +; SI-NEXT: v_writelane_b32 v44, s4, 15 ; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_readfirstlane_b32 s4, v51 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:252 @@ -154062,33 +154870,33 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_readfirstlane_b32 s75, v32 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_readfirstlane_b32 s61, v33 +; SI-NEXT: v_readfirstlane_b32 s21, v33 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:224 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:220 -; SI-NEXT: v_writelane_b32 v43, s4, 16 +; SI-NEXT: v_writelane_b32 v44, s4, 16 ; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_readfirstlane_b32 s43, v34 +; SI-NEXT: v_readfirstlane_b32 s4, v34 ; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_readfirstlane_b32 s40, v35 ; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_readfirstlane_b32 s4, v36 +; SI-NEXT: v_readfirstlane_b32 s61, v36 ; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_readfirstlane_b32 s63, v37 ; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:216 ; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:212 ; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:208 ; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:204 -; SI-NEXT: v_writelane_b32 v43, s4, 17 +; SI-NEXT: v_writelane_b32 v44, s4, 17 ; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_readfirstlane_b32 s59, v31 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_readfirstlane_b32 s42, v38 +; SI-NEXT: v_readfirstlane_b32 s56, v38 ; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_readfirstlane_b32 s73, v39 +; SI-NEXT: v_readfirstlane_b32 s43, v39 ; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_readfirstlane_b32 s21, v48 +; SI-NEXT: v_readfirstlane_b32 s46, v48 ; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_readfirstlane_b32 s57, v49 +; SI-NEXT: v_readfirstlane_b32 s42, v49 ; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_readfirstlane_b32 s13, v50 ; SI-NEXT: s_waitcnt vmcnt(6) @@ -154101,43 +154909,44 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:180 ; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:176 ; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_readfirstlane_b32 s47, v32 +; SI-NEXT: v_readfirstlane_b32 s88, v32 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_readfirstlane_b32 s24, v33 +; SI-NEXT: v_readfirstlane_b32 s79, v33 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:172 ; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:168 ; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_readfirstlane_b32 s78, v34 +; SI-NEXT: v_readfirstlane_b32 s4, v34 +; SI-NEXT: v_writelane_b32 v44, s4, 18 ; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_readfirstlane_b32 s4, v35 -; SI-NEXT: v_writelane_b32 v43, s4, 18 +; SI-NEXT: v_writelane_b32 v44, s4, 19 ; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_readfirstlane_b32 s4, v36 -; SI-NEXT: v_writelane_b32 v43, s4, 19 +; SI-NEXT: v_writelane_b32 v44, s4, 20 ; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_readfirstlane_b32 s4, v37 -; SI-NEXT: v_writelane_b32 v43, s4, 20 +; SI-NEXT: v_writelane_b32 v44, s4, 21 ; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_readfirstlane_b32 s4, v31 -; SI-NEXT: v_writelane_b32 v43, s4, 21 +; SI-NEXT: v_writelane_b32 v44, s4, 22 ; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_readfirstlane_b32 s4, v38 -; SI-NEXT: v_writelane_b32 v43, s4, 22 +; SI-NEXT: v_writelane_b32 v44, s4, 23 ; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: 
v_readfirstlane_b32 s4, v39 -; SI-NEXT: v_writelane_b32 v43, s4, 23 +; SI-NEXT: v_writelane_b32 v44, s4, 24 ; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_readfirstlane_b32 s4, v48 -; SI-NEXT: v_writelane_b32 v43, s4, 24 +; SI-NEXT: v_writelane_b32 v44, s4, 25 ; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_readfirstlane_b32 s4, v49 -; SI-NEXT: v_writelane_b32 v43, s4, 25 +; SI-NEXT: v_writelane_b32 v44, s4, 26 ; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_readfirstlane_b32 s4, v50 -; SI-NEXT: v_writelane_b32 v43, s4, 26 +; SI-NEXT: v_writelane_b32 v44, s4, 27 ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_readfirstlane_b32 s4, v51 -; SI-NEXT: v_writelane_b32 v43, s4, 27 +; SI-NEXT: v_writelane_b32 v44, s4, 28 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:148 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:144 ; SI-NEXT: s_waitcnt vmcnt(3) @@ -154153,42 +154962,41 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:112 ; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:108 ; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:104 -; SI-NEXT: v_writelane_b32 v43, s4, 28 +; SI-NEXT: v_writelane_b32 v44, s4, 29 ; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_readfirstlane_b32 s4, v52 -; SI-NEXT: v_writelane_b32 v43, s4, 29 +; SI-NEXT: v_writelane_b32 v44, s4, 30 ; SI-NEXT: v_readfirstlane_b32 s4, v53 -; SI-NEXT: v_writelane_b32 v43, s4, 30 +; SI-NEXT: v_writelane_b32 v44, s4, 31 ; SI-NEXT: v_readfirstlane_b32 s4, v54 -; SI-NEXT: v_writelane_b32 v43, s4, 31 +; SI-NEXT: v_writelane_b32 v44, s4, 32 ; SI-NEXT: v_readfirstlane_b32 s4, v55 -; SI-NEXT: v_writelane_b32 v43, s4, 32 -; SI-NEXT: v_readfirstlane_b32 s4, v40 -; SI-NEXT: v_writelane_b32 v43, s4, 33 -; SI-NEXT: v_writelane_b32 v43, s22, 34 -; SI-NEXT: v_writelane_b32 v43, s23, 35 -; SI-NEXT: v_writelane_b32 v43, s72, 36 -; SI-NEXT: v_writelane_b32 v43, s20, 37 -; SI-NEXT: v_writelane_b32 v43, s79, 38 -; SI-NEXT: v_writelane_b32 v43, s76, 39 -; SI-NEXT: v_writelane_b32 v43, s25, 40 -; SI-NEXT: v_writelane_b32 v43, s60, 41 -; SI-NEXT: v_writelane_b32 v43, s29, 42 -; SI-NEXT: v_writelane_b32 v43, s77, 43 -; SI-NEXT: v_writelane_b32 v43, s16, 44 -; SI-NEXT: v_writelane_b32 v43, s17, 45 -; SI-NEXT: v_writelane_b32 v43, s18, 46 -; SI-NEXT: v_writelane_b32 v43, s19, 47 -; SI-NEXT: v_writelane_b32 v43, s88, 48 -; SI-NEXT: v_writelane_b32 v43, s89, 49 -; SI-NEXT: v_writelane_b32 v43, s90, 50 -; SI-NEXT: v_writelane_b32 v43, s91, 51 -; SI-NEXT: v_writelane_b32 v43, s92, 52 -; SI-NEXT: v_writelane_b32 v43, s93, 53 -; SI-NEXT: v_writelane_b32 v43, s94, 54 -; SI-NEXT: v_writelane_b32 v43, s95, 55 +; SI-NEXT: v_writelane_b32 v44, s4, 33 +; SI-NEXT: v_writelane_b32 v44, s22, 34 +; SI-NEXT: v_writelane_b32 v44, s23, 35 +; SI-NEXT: v_writelane_b32 v44, s73, 36 +; SI-NEXT: v_writelane_b32 v44, s20, 37 +; SI-NEXT: v_writelane_b32 v44, s47, 38 +; SI-NEXT: v_writelane_b32 v44, s76, 39 +; SI-NEXT: v_writelane_b32 v44, s25, 40 +; SI-NEXT: v_writelane_b32 v44, s57, 41 +; SI-NEXT: v_writelane_b32 v44, s74, 42 +; SI-NEXT: v_writelane_b32 v44, s78, 43 +; SI-NEXT: v_writelane_b32 v44, s24, 44 +; SI-NEXT: v_writelane_b32 v44, s16, 45 +; SI-NEXT: v_writelane_b32 v44, s17, 46 +; SI-NEXT: v_writelane_b32 v44, s18, 47 +; SI-NEXT: v_writelane_b32 v44, s19, 48 +; SI-NEXT: v_writelane_b32 v44, s77, 49 +; SI-NEXT: v_writelane_b32 v44, s89, 50 +; SI-NEXT: v_writelane_b32 v44, s90, 51 +; SI-NEXT: v_writelane_b32 v44, s91, 52 +; SI-NEXT: v_writelane_b32 v44, s92, 53 +; SI-NEXT: 
v_writelane_b32 v44, s93, 54 +; SI-NEXT: v_writelane_b32 v44, s94, 55 +; SI-NEXT: v_writelane_b32 v44, s95, 56 ; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_readfirstlane_b32 s62, v33 +; SI-NEXT: v_readfirstlane_b32 s58, v33 ; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_readfirstlane_b32 s10, v34 ; SI-NEXT: s_waitcnt vmcnt(8) @@ -154196,7 +155004,7 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: v_readfirstlane_b32 s28, v31 ; SI-NEXT: v_readfirstlane_b32 s27, v32 ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_readfirstlane_b32 s58, v36 +; SI-NEXT: v_readfirstlane_b32 s29, v36 ; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_readfirstlane_b32 s69, v37 ; SI-NEXT: s_waitcnt vmcnt(5) @@ -154227,32 +155035,31 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_readfirstlane_b32 vcc_lo, v12 ; SI-NEXT: v_readfirstlane_b32 vcc_hi, v11 -; SI-NEXT: v_writelane_b32 v43, vcc_lo, 56 -; SI-NEXT: v_writelane_b32 v43, vcc_hi, 57 -; SI-NEXT: v_writelane_b32 v43, s30, 58 -; SI-NEXT: v_writelane_b32 v43, s31, 59 -; SI-NEXT: v_writelane_b32 v43, s34, 60 -; SI-NEXT: v_writelane_b32 v43, s35, 61 -; SI-NEXT: v_writelane_b32 v43, s36, 62 -; SI-NEXT: v_writelane_b32 v43, s37, 63 +; SI-NEXT: v_writelane_b32 v44, vcc_lo, 57 +; SI-NEXT: v_writelane_b32 v44, vcc_hi, 58 +; SI-NEXT: v_writelane_b32 v44, s30, 59 +; SI-NEXT: v_writelane_b32 v44, s31, 60 +; SI-NEXT: v_writelane_b32 v44, s34, 61 +; SI-NEXT: v_writelane_b32 v44, s35, 62 +; SI-NEXT: v_writelane_b32 v44, s36, 63 ; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_readfirstlane_b32 s74, v31 +; SI-NEXT: v_readfirstlane_b32 s60, v31 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_readfirstlane_b32 s46, v32 +; SI-NEXT: v_readfirstlane_b32 s62, v32 ; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_readfirstlane_b32 s96, v33 +; SI-NEXT: v_readfirstlane_b32 s83, v33 ; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_readfirstlane_b32 s98, v34 ; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_readfirstlane_b32 s41, v35 +; SI-NEXT: v_readfirstlane_b32 s81, v35 ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_readfirstlane_b32 s56, v36 +; SI-NEXT: v_readfirstlane_b32 s72, v36 ; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_readfirstlane_b32 s87, v37 ; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_readfirstlane_b32 s99, v38 ; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_readfirstlane_b32 s81, v39 +; SI-NEXT: v_readfirstlane_b32 s82, v39 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:48 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:44 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:40 @@ -154264,9 +155071,9 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_readfirstlane_b32 s26, v48 ; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_readfirstlane_b32 s83, v49 +; SI-NEXT: v_readfirstlane_b32 s15, v49 ; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_readfirstlane_b32 s82, v50 +; SI-NEXT: v_readfirstlane_b32 s96, v50 ; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_readfirstlane_b32 s7, v51 ; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:16 @@ -154275,7 +155082,7 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 ; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_readfirstlane_b32 s15, v31 +; SI-NEXT: v_readfirstlane_b32 s41, v31 ; SI-NEXT: s_waitcnt vmcnt(11) ; 
SI-NEXT: v_readfirstlane_b32 s97, v32 ; SI-NEXT: s_waitcnt vmcnt(10) @@ -154296,144 +155103,146 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: v_readfirstlane_b32 s65, v48 ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_readfirstlane_b32 s64, v49 -; SI-NEXT: v_writelane_b32 v42, s64, 10 +; SI-NEXT: v_writelane_b32 v43, s64, 11 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_readfirstlane_b32 s67, v50 -; SI-NEXT: v_writelane_b32 v42, s65, 11 +; SI-NEXT: v_writelane_b32 v43, s65, 12 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_readfirstlane_b32 s84, v51 -; SI-NEXT: v_writelane_b32 v42, s67, 12 -; SI-NEXT: v_writelane_b32 v42, s84, 13 -; SI-NEXT: v_writelane_b32 v42, s85, 14 -; SI-NEXT: v_writelane_b32 v42, s86, 15 -; SI-NEXT: v_writelane_b32 v42, s87, 16 -; SI-NEXT: v_writelane_b32 v42, s8, 17 -; SI-NEXT: v_writelane_b32 v42, s99, 18 -; SI-NEXT: v_writelane_b32 v42, s12, 19 -; SI-NEXT: v_writelane_b32 v42, s44, 20 -; SI-NEXT: v_writelane_b32 v42, s97, 21 -; SI-NEXT: v_writelane_b32 v42, s83, 22 -; SI-NEXT: v_writelane_b32 v42, s82, 23 -; SI-NEXT: v_writelane_b32 v42, s98, 24 -; SI-NEXT: v_writelane_b32 v42, s96, 25 -; SI-NEXT: v_writelane_b32 v42, s81, 26 -; SI-NEXT: v_writelane_b32 v42, s9, 27 -; SI-NEXT: v_writelane_b32 v42, s41, 28 -; SI-NEXT: v_writelane_b32 v42, s80, 29 -; SI-NEXT: v_writelane_b32 v42, s7, 30 -; SI-NEXT: v_writelane_b32 v42, s56, 31 -; SI-NEXT: v_writelane_b32 v42, s26, 32 -; SI-NEXT: v_writelane_b32 v42, s15, 33 -; SI-NEXT: v_writelane_b32 v42, s14, 34 -; SI-NEXT: v_writelane_b32 v42, s69, 35 -; SI-NEXT: v_writelane_b32 v42, s71, 36 -; SI-NEXT: v_writelane_b32 v42, s70, 37 -; SI-NEXT: v_writelane_b32 v42, s68, 38 -; SI-NEXT: v_writelane_b32 v42, s74, 39 -; SI-NEXT: v_writelane_b32 v42, s46, 40 -; SI-NEXT: v_writelane_b32 v42, s11, 41 -; SI-NEXT: v_writelane_b32 v42, s10, 42 -; SI-NEXT: v_writelane_b32 v42, s62, 43 -; SI-NEXT: v_writelane_b32 v42, s66, 44 -; SI-NEXT: v_writelane_b32 v42, s58, 45 -; SI-NEXT: v_writelane_b32 v42, s28, 46 -; SI-NEXT: v_writelane_b32 v42, s27, 47 -; SI-NEXT: v_writelane_b32 v42, s78, 48 -; SI-NEXT: v_writelane_b32 v42, s24, 49 +; SI-NEXT: v_writelane_b32 v43, s67, 13 +; SI-NEXT: v_writelane_b32 v43, s84, 14 +; SI-NEXT: v_writelane_b32 v43, s85, 15 +; SI-NEXT: v_writelane_b32 v43, s86, 16 +; SI-NEXT: v_writelane_b32 v43, s87, 17 +; SI-NEXT: v_writelane_b32 v43, s8, 18 +; SI-NEXT: v_writelane_b32 v43, s99, 19 +; SI-NEXT: v_writelane_b32 v43, s12, 20 +; SI-NEXT: v_writelane_b32 v43, s44, 21 +; SI-NEXT: v_writelane_b32 v43, s97, 22 +; SI-NEXT: v_writelane_b32 v43, s15, 23 +; SI-NEXT: v_writelane_b32 v43, s96, 24 +; SI-NEXT: v_writelane_b32 v43, s98, 25 +; SI-NEXT: v_writelane_b32 v43, s83, 26 +; SI-NEXT: v_writelane_b32 v43, s82, 27 +; SI-NEXT: v_writelane_b32 v43, s9, 28 +; SI-NEXT: v_writelane_b32 v43, s81, 29 +; SI-NEXT: v_writelane_b32 v43, s80, 30 +; SI-NEXT: v_writelane_b32 v43, s7, 31 +; SI-NEXT: v_writelane_b32 v43, s72, 32 +; SI-NEXT: v_writelane_b32 v43, s26, 33 +; SI-NEXT: v_writelane_b32 v43, s41, 34 +; SI-NEXT: v_writelane_b32 v43, s14, 35 +; SI-NEXT: v_writelane_b32 v43, s69, 36 +; SI-NEXT: v_writelane_b32 v43, s71, 37 +; SI-NEXT: v_writelane_b32 v43, s70, 38 +; SI-NEXT: v_writelane_b32 v43, s68, 39 +; SI-NEXT: v_writelane_b32 v43, s60, 40 +; SI-NEXT: v_writelane_b32 v43, s62, 41 +; SI-NEXT: v_writelane_b32 v43, s11, 42 +; SI-NEXT: v_writelane_b32 v43, s10, 43 +; SI-NEXT: v_writelane_b32 v43, s58, 44 +; SI-NEXT: v_writelane_b32 v43, s66, 45 +; SI-NEXT: v_writelane_b32 v43, s29, 46 +; SI-NEXT: 
v_writelane_b32 v43, s28, 47 +; SI-NEXT: v_writelane_b32 v43, s27, 48 ; SI-NEXT: s_cbranch_scc0 .LBB89_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_readlane_b32 s4, v43, 3 +; SI-NEXT: v_readlane_b32 s4, v44, 3 ; SI-NEXT: s_and_b32 s4, s4, 0xff -; SI-NEXT: v_readlane_b32 s5, v43, 2 +; SI-NEXT: v_readlane_b32 s5, v44, 2 ; SI-NEXT: s_lshl_b32 s4, s4, 16 ; SI-NEXT: s_lshl_b32 s5, s5, 24 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: v_writelane_b32 v42, s4, 56 -; SI-NEXT: v_readlane_b32 s4, v43, 1 +; SI-NEXT: v_writelane_b32 v43, s4, 58 +; SI-NEXT: v_readlane_b32 s4, v44, 1 ; SI-NEXT: s_and_b32 s4, s4, 0xff -; SI-NEXT: v_readlane_b32 s5, v43, 0 +; SI-NEXT: v_readlane_b32 s5, v44, 0 ; SI-NEXT: s_lshl_b32 s4, s4, 16 ; SI-NEXT: s_lshl_b32 s5, s5, 24 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: v_writelane_b32 v42, s4, 57 +; SI-NEXT: v_writelane_b32 v43, s4, 59 ; SI-NEXT: s_and_b32 s4, s20, 0xff -; SI-NEXT: s_lshl_b32 s5, s72, 8 +; SI-NEXT: s_lshl_b32 s5, s73, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: s_and_b32 s5, s22, 0xff ; SI-NEXT: s_lshl_b32 s5, s5, 16 ; SI-NEXT: s_mov_b32 s22, s6 ; SI-NEXT: s_lshl_b32 s6, s23, 24 -; SI-NEXT: v_writelane_b32 v42, s4, 58 +; SI-NEXT: v_writelane_b32 v43, s4, 60 ; SI-NEXT: s_or_b32 s4, s6, s5 -; SI-NEXT: s_and_b32 s5, s60, 0xff +; SI-NEXT: s_and_b32 s5, s57, 0xff ; SI-NEXT: s_lshl_b32 s5, s5, 16 ; SI-NEXT: s_lshl_b32 s6, s25, 24 -; SI-NEXT: v_writelane_b32 v42, s4, 59 -; SI-NEXT: s_or_b32 s5, s6, s5 -; SI-NEXT: v_writelane_b32 v42, s5, 60 -; SI-NEXT: s_and_b32 s5, s79, 0xff +; SI-NEXT: v_writelane_b32 v43, s4, 61 +; SI-NEXT: s_or_b32 s4, s6, s5 +; SI-NEXT: s_and_b32 s5, s47, 0xff ; SI-NEXT: s_lshl_b32 s5, s5, 16 ; SI-NEXT: s_lshl_b32 s6, s76, 24 -; SI-NEXT: s_or_b32 s5, s6, s5 -; SI-NEXT: v_writelane_b32 v42, s5, 61 -; SI-NEXT: s_and_b32 s5, s77, 0xff -; SI-NEXT: s_lshl_b32 s6, s29, 8 +; SI-NEXT: v_writelane_b32 v43, s4, 62 +; SI-NEXT: s_or_b32 s4, s6, s5 +; SI-NEXT: s_and_b32 s5, s78, 0xff +; SI-NEXT: s_lshl_b32 s6, s74, 8 ; SI-NEXT: s_or_b32 s5, s5, s6 ; SI-NEXT: s_and_b32 s6, s16, 0xff ; SI-NEXT: s_lshl_b32 s6, s6, 16 ; SI-NEXT: s_lshl_b32 s16, s17, 24 -; SI-NEXT: s_or_b32 s6, s16, s6 -; SI-NEXT: v_writelane_b32 v42, s6, 62 +; SI-NEXT: v_writelane_b32 v43, s4, 63 +; SI-NEXT: s_or_b32 s4, s16, s6 ; SI-NEXT: s_and_b32 s6, s89, 0xff +; SI-NEXT: ; implicit-def: $vgpr42 : SGPR spill to VGPR lane ; SI-NEXT: s_lshl_b32 s6, s6, 16 -; SI-NEXT: s_lshl_b32 s16, s88, 24 -; SI-NEXT: s_mov_b32 s4, s47 -; SI-NEXT: s_or_b32 s47, s16, s6 +; SI-NEXT: s_lshl_b32 s16, s77, 24 +; SI-NEXT: v_writelane_b32 v42, s4, 0 +; SI-NEXT: s_or_b32 s6, s16, s6 +; SI-NEXT: v_writelane_b32 v42, s6, 1 ; SI-NEXT: s_and_b32 s6, s18, 0xff ; SI-NEXT: s_lshl_b32 s6, s6, 16 ; SI-NEXT: s_lshl_b32 s16, s19, 24 -; SI-NEXT: s_or_b32 s25, s16, s6 +; SI-NEXT: s_or_b32 s76, s16, s6 ; SI-NEXT: s_and_b32 s6, s93, 0xff ; SI-NEXT: s_lshl_b32 s16, s92, 8 ; SI-NEXT: s_or_b32 s6, s6, s16 ; SI-NEXT: s_and_b32 s16, s90, 0xff ; SI-NEXT: s_lshl_b32 s16, s16, 16 ; SI-NEXT: s_lshl_b32 s17, s91, 24 -; SI-NEXT: s_or_b32 s92, s17, s16 +; SI-NEXT: s_or_b32 s77, s17, s16 ; SI-NEXT: s_and_b32 s16, vcc_hi, 0xff ; SI-NEXT: s_lshl_b32 s16, s16, 16 ; SI-NEXT: s_lshl_b32 s17, vcc_lo, 24 -; SI-NEXT: s_or_b32 s76, s17, s16 +; SI-NEXT: s_or_b32 s25, s17, s16 ; SI-NEXT: s_and_b32 s16, s94, 0xff ; SI-NEXT: s_lshl_b32 s16, s16, 16 ; SI-NEXT: s_lshl_b32 s17, s95, 24 -; SI-NEXT: s_or_b32 s91, s17, s16 +; SI-NEXT: s_or_b32 s74, s17, s16 ; SI-NEXT: s_and_b32 s16, s35, 0xff ; SI-NEXT: s_lshl_b32 s17, s34, 8 ; SI-NEXT: s_or_b32 s16, s16, s17 ; 
SI-NEXT: s_and_b32 s17, s30, 0xff ; SI-NEXT: s_lshl_b32 s17, s17, 16 ; SI-NEXT: s_lshl_b32 s18, s31, 24 -; SI-NEXT: s_or_b32 s77, s18, s17 +; SI-NEXT: s_or_b32 s78, s18, s17 ; SI-NEXT: s_and_b32 s17, s39, 0xff ; SI-NEXT: s_lshl_b32 s17, s17, 16 ; SI-NEXT: s_lshl_b32 s18, s38, 24 -; SI-NEXT: s_or_b32 s79, s18, s17 +; SI-NEXT: s_mov_b32 s31, s88 +; SI-NEXT: s_or_b32 s88, s18, s17 ; SI-NEXT: s_and_b32 s17, s36, 0xff ; SI-NEXT: s_lshl_b32 s17, s17, 16 ; SI-NEXT: s_lshl_b32 s18, s37, 24 -; SI-NEXT: s_or_b32 s93, s18, s17 +; SI-NEXT: s_or_b32 s89, s18, s17 ; SI-NEXT: s_and_b32 s17, s51, 0xff ; SI-NEXT: s_lshl_b32 s18, s50, 8 ; SI-NEXT: s_or_b32 s17, s17, s18 ; SI-NEXT: s_and_b32 s18, s48, 0xff ; SI-NEXT: s_lshl_b32 s18, s18, 16 ; SI-NEXT: s_lshl_b32 s19, s49, 24 -; SI-NEXT: s_or_b32 s89, s19, s18 +; SI-NEXT: s_or_b32 s18, s19, s18 +; SI-NEXT: v_writelane_b32 v43, s18, 49 ; SI-NEXT: s_and_b32 s18, s55, 0xff ; SI-NEXT: s_lshl_b32 s18, s18, 16 ; SI-NEXT: s_lshl_b32 s19, s54, 24 -; SI-NEXT: s_or_b32 s31, s19, s18 +; SI-NEXT: s_mov_b32 s73, s79 +; SI-NEXT: s_or_b32 s79, s19, s18 ; SI-NEXT: s_and_b32 s18, s52, 0xff ; SI-NEXT: s_lshl_b32 s18, s18, 16 ; SI-NEXT: s_lshl_b32 s19, s53, 24 @@ -154444,7 +155253,7 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_and_b32 s19, s64, 0xff ; SI-NEXT: s_lshl_b32 s19, s19, 16 ; SI-NEXT: s_lshl_b32 s20, s65, 24 -; SI-NEXT: s_or_b32 s60, s20, s19 +; SI-NEXT: s_or_b32 s95, s20, s19 ; SI-NEXT: s_and_b32 s19, s12, 0xff ; SI-NEXT: s_lshl_b32 s19, s19, 16 ; SI-NEXT: s_lshl_b32 s20, s8, 24 @@ -154460,217 +155269,226 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_lshl_b32 s19, s19, 16 ; SI-NEXT: s_lshl_b32 s20, s97, 24 ; SI-NEXT: s_or_b32 s9, s20, s19 -; SI-NEXT: s_and_b32 s19, s15, 0xff +; SI-NEXT: s_and_b32 s19, s41, 0xff ; SI-NEXT: s_lshl_b32 s19, s19, 16 ; SI-NEXT: s_lshl_b32 s20, s7, 24 ; SI-NEXT: s_or_b32 s7, s20, s19 -; SI-NEXT: s_and_b32 s19, s82, 0xff +; SI-NEXT: s_and_b32 s19, s96, 0xff ; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_lshl_b32 s20, s83, 24 -; SI-NEXT: s_or_b32 s23, s20, s19 +; SI-NEXT: s_lshl_b32 s20, s15, 24 +; SI-NEXT: v_writelane_b32 v43, s12, 50 +; SI-NEXT: s_or_b32 s12, s20, s19 ; SI-NEXT: s_and_b32 s19, s26, 0xff -; SI-NEXT: s_lshl_b32 s20, s81, 8 +; SI-NEXT: s_lshl_b32 s20, s82, 8 ; SI-NEXT: s_or_b32 vcc_hi, s19, s20 ; SI-NEXT: s_and_b32 s19, s99, 0xff -; SI-NEXT: v_writelane_b32 v42, s9, 50 ; SI-NEXT: s_lshl_b32 s19, s19, 16 ; SI-NEXT: s_lshl_b32 s20, s87, 24 -; SI-NEXT: v_writelane_b32 v42, s7, 51 -; SI-NEXT: s_or_b32 s7, s20, s19 -; SI-NEXT: s_and_b32 s19, s56, 0xff +; SI-NEXT: v_writelane_b32 v43, s9, 51 +; SI-NEXT: s_or_b32 s9, s20, s19 +; SI-NEXT: s_and_b32 s19, s72, 0xff ; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_lshl_b32 s20, s41, 24 -; SI-NEXT: v_writelane_b32 v42, s7, 52 -; SI-NEXT: s_or_b32 s7, s20, s19 +; SI-NEXT: s_lshl_b32 s20, s81, 24 +; SI-NEXT: v_writelane_b32 v43, s9, 52 +; SI-NEXT: s_or_b32 s9, s20, s19 ; SI-NEXT: s_and_b32 s19, s98, 0xff ; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_lshl_b32 s20, s96, 24 -; SI-NEXT: v_writelane_b32 v42, s7, 54 -; SI-NEXT: s_or_b32 s7, s20, s19 -; SI-NEXT: s_and_b32 s19, s46, 0xff -; SI-NEXT: s_lshl_b32 s20, s74, 8 +; SI-NEXT: s_lshl_b32 s20, s83, 24 +; SI-NEXT: v_writelane_b32 v43, s9, 54 +; SI-NEXT: s_or_b32 s9, s20, s19 +; SI-NEXT: s_and_b32 s19, s62, 0xff +; SI-NEXT: s_lshl_b32 s20, s60, 8 ; SI-NEXT: s_or_b32 s84, s19, s20 ; SI-NEXT: s_and_b32 s19, s71, 0xff ; SI-NEXT: s_lshl_b32 
s19, s19, 16 ; SI-NEXT: s_lshl_b32 s20, s70, 24 -; SI-NEXT: s_or_b32 s72, s20, s19 +; SI-NEXT: v_writelane_b32 v43, s9, 53 +; SI-NEXT: s_or_b32 s9, s20, s19 ; SI-NEXT: s_and_b32 s19, s11, 0xff ; SI-NEXT: s_lshl_b32 s19, s19, 16 ; SI-NEXT: s_lshl_b32 s20, s68, 24 -; SI-NEXT: v_writelane_b32 v42, s7, 53 -; SI-NEXT: s_or_b32 s7, s20, s19 +; SI-NEXT: s_or_b32 s57, s20, s19 ; SI-NEXT: s_and_b32 s19, s14, 0xff ; SI-NEXT: s_lshl_b32 s19, s19, 16 ; SI-NEXT: s_lshl_b32 s20, s69, 24 +; SI-NEXT: v_writelane_b32 v43, s9, 55 ; SI-NEXT: s_or_b32 s9, s20, s19 -; SI-NEXT: s_and_b32 s19, s58, 0xff +; SI-NEXT: s_and_b32 s19, s29, 0xff ; SI-NEXT: s_lshl_b32 s20, s66, 8 ; SI-NEXT: s_or_b32 s85, s19, s20 ; SI-NEXT: s_and_b32 s19, s10, 0xff ; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_lshl_b32 s20, s62, 24 -; SI-NEXT: s_or_b32 s49, s20, s19 +; SI-NEXT: s_lshl_b32 s20, s58, 24 +; SI-NEXT: v_writelane_b32 v43, s9, 56 +; SI-NEXT: s_or_b32 s9, s20, s19 ; SI-NEXT: s_and_b32 s19, s27, 0xff -; SI-NEXT: v_writelane_b32 v42, s9, 55 ; SI-NEXT: s_lshl_b32 s19, s19, 16 ; SI-NEXT: s_lshl_b32 s20, s28, 24 -; SI-NEXT: v_readlane_b32 s9, v43, 33 -; SI-NEXT: s_or_b32 s50, s20, s19 -; SI-NEXT: s_and_b32 s19, s9, 0xff -; SI-NEXT: v_readlane_b32 s9, v43, 32 +; SI-NEXT: v_writelane_b32 v43, s9, 57 +; SI-NEXT: s_or_b32 s23, s20, s19 +; SI-NEXT: s_and_b32 s19, s24, 0xff +; SI-NEXT: v_readlane_b32 s9, v44, 33 ; SI-NEXT: s_lshl_b32 s19, s19, 16 ; SI-NEXT: s_lshl_b32 s20, s9, 24 -; SI-NEXT: v_readlane_b32 s9, v43, 31 -; SI-NEXT: s_or_b32 s51, s20, s19 +; SI-NEXT: v_readlane_b32 s9, v44, 32 +; SI-NEXT: s_or_b32 s10, s20, s19 ; SI-NEXT: s_and_b32 s19, s9, 0xff -; SI-NEXT: v_readlane_b32 s9, v43, 30 +; SI-NEXT: v_readlane_b32 s9, v44, 31 ; SI-NEXT: s_lshl_b32 s20, s9, 8 -; SI-NEXT: v_readlane_b32 s9, v43, 29 +; SI-NEXT: v_readlane_b32 s9, v44, 30 ; SI-NEXT: s_or_b32 s86, s19, s20 ; SI-NEXT: s_and_b32 s19, s9, 0xff -; SI-NEXT: v_readlane_b32 s9, v43, 28 +; SI-NEXT: v_readlane_b32 s9, v44, 29 ; SI-NEXT: s_lshl_b32 s19, s19, 16 ; SI-NEXT: s_lshl_b32 s20, s9, 24 -; SI-NEXT: v_readlane_b32 s9, v43, 27 -; SI-NEXT: s_or_b32 s52, s20, s19 +; SI-NEXT: v_readlane_b32 s9, v44, 28 +; SI-NEXT: s_or_b32 s47, s20, s19 ; SI-NEXT: s_and_b32 s19, s9, 0xff -; SI-NEXT: v_readlane_b32 s9, v43, 26 +; SI-NEXT: v_readlane_b32 s9, v44, 27 ; SI-NEXT: s_lshl_b32 s19, s19, 16 ; SI-NEXT: s_lshl_b32 s20, s9, 24 -; SI-NEXT: v_readlane_b32 s9, v43, 25 -; SI-NEXT: s_or_b32 s53, s20, s19 -; SI-NEXT: s_and_b32 s19, s9, 0xff -; SI-NEXT: v_readlane_b32 s9, v43, 24 +; SI-NEXT: v_readlane_b32 s11, v44, 26 +; SI-NEXT: s_or_b32 s9, s20, s19 +; SI-NEXT: s_and_b32 s19, s11, 0xff +; SI-NEXT: v_readlane_b32 s11, v44, 25 ; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_lshl_b32 s20, s9, 24 -; SI-NEXT: v_readlane_b32 s9, v43, 23 -; SI-NEXT: s_or_b32 s54, s20, s19 -; SI-NEXT: s_and_b32 s19, s9, 0xff -; SI-NEXT: v_readlane_b32 s9, v43, 22 -; SI-NEXT: s_lshl_b32 s20, s9, 8 -; SI-NEXT: v_readlane_b32 s9, v43, 21 +; SI-NEXT: s_lshl_b32 s20, s11, 24 +; SI-NEXT: v_readlane_b32 s11, v44, 24 +; SI-NEXT: s_or_b32 s24, s20, s19 +; SI-NEXT: s_mov_b32 s92, s11 +; SI-NEXT: s_and_b32 s19, s11, 0xff +; SI-NEXT: v_readlane_b32 s11, v44, 23 +; SI-NEXT: s_mov_b32 s36, s11 +; SI-NEXT: s_lshl_b32 s20, s11, 8 +; SI-NEXT: v_readlane_b32 s11, v44, 22 ; SI-NEXT: s_or_b32 s87, s19, s20 -; SI-NEXT: s_and_b32 s19, s9, 0xff -; SI-NEXT: v_readlane_b32 s9, v43, 20 +; SI-NEXT: s_mov_b32 s62, s11 +; SI-NEXT: s_and_b32 s19, s11, 0xff +; SI-NEXT: v_readlane_b32 s11, v44, 21 ; SI-NEXT: s_lshl_b32 s19, s19, 16 -; 
SI-NEXT: s_lshl_b32 s20, s9, 24 -; SI-NEXT: v_readlane_b32 s9, v43, 19 -; SI-NEXT: s_or_b32 s55, s20, s19 -; SI-NEXT: s_mov_b32 s58, s9 -; SI-NEXT: s_and_b32 s19, s9, 0xff -; SI-NEXT: v_readlane_b32 s9, v43, 18 +; SI-NEXT: s_mov_b32 s30, s11 +; SI-NEXT: s_lshl_b32 s20, s11, 24 +; SI-NEXT: v_readlane_b32 s11, v44, 20 +; SI-NEXT: s_or_b32 s58, s20, s19 +; SI-NEXT: s_mov_b32 s91, s11 +; SI-NEXT: s_and_b32 s19, s11, 0xff +; SI-NEXT: v_readlane_b32 s11, v44, 19 ; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_lshl_b32 s20, s9, 24 -; SI-NEXT: s_or_b32 s64, s20, s19 -; SI-NEXT: s_and_b32 s19, s78, 0xff +; SI-NEXT: s_mov_b32 s35, s11 +; SI-NEXT: s_lshl_b32 s20, s11, 24 +; SI-NEXT: v_readlane_b32 s11, v44, 18 +; SI-NEXT: s_mov_b32 s4, s46 +; SI-NEXT: s_or_b32 s46, s20, s19 +; SI-NEXT: s_and_b32 s19, s11, 0xff ; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_lshl_b32 s20, s24, 24 -; SI-NEXT: s_or_b32 s65, s20, s19 -; SI-NEXT: s_and_b32 s19, s4, 0xff +; SI-NEXT: s_lshl_b32 s20, s73, 24 +; SI-NEXT: s_mov_b32 s52, s73 +; SI-NEXT: s_or_b32 s73, s20, s19 +; SI-NEXT: s_and_b32 s19, s31, 0xff ; SI-NEXT: s_lshl_b32 s20, s45, 8 ; SI-NEXT: s_or_b32 s26, s19, s20 ; SI-NEXT: s_and_b32 s19, s13, 0xff ; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_lshl_b32 s20, s57, 24 -; SI-NEXT: s_or_b32 s66, s20, s19 -; SI-NEXT: s_and_b32 s19, s21, 0xff -; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_lshl_b32 s20, s73, 24 +; SI-NEXT: s_lshl_b32 s20, s42, 24 ; SI-NEXT: s_or_b32 s67, s20, s19 -; SI-NEXT: s_and_b32 s19, s42, 0xff -; SI-NEXT: v_readlane_b32 s88, v43, 17 +; SI-NEXT: s_and_b32 s19, s4, 0xff +; SI-NEXT: s_lshl_b32 s19, s19, 16 +; SI-NEXT: s_lshl_b32 s20, s43, 24 +; SI-NEXT: s_mov_b32 s53, s42 +; SI-NEXT: s_or_b32 s42, s20, s19 +; SI-NEXT: s_and_b32 s19, s56, 0xff ; SI-NEXT: s_lshl_b32 s19, s19, 16 ; SI-NEXT: s_lshl_b32 s20, s59, 24 ; SI-NEXT: s_or_b32 s68, s20, s19 ; SI-NEXT: s_and_b32 s19, s63, 0xff -; SI-NEXT: s_lshl_b32 s20, s88, 8 +; SI-NEXT: s_lshl_b32 s20, s61, 8 +; SI-NEXT: v_readlane_b32 s93, v44, 17 ; SI-NEXT: s_or_b32 s27, s19, s20 ; SI-NEXT: s_and_b32 s19, s40, 0xff ; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_lshl_b32 s20, s43, 24 -; SI-NEXT: s_or_b32 s69, s20, s19 -; SI-NEXT: s_and_b32 s19, s61, 0xff -; SI-NEXT: s_mov_b32 s39, s57 -; SI-NEXT: s_mov_b32 s57, s7 +; SI-NEXT: s_lshl_b32 s20, s93, 24 +; SI-NEXT: s_or_b32 s70, s20, s19 +; SI-NEXT: s_and_b32 s19, s21, 0xff +; SI-NEXT: s_mov_b32 s51, s59 +; SI-NEXT: s_mov_b32 s59, s7 ; SI-NEXT: s_lshl_b32 s19, s19, 16 ; SI-NEXT: s_lshl_b32 s20, s75, 24 -; SI-NEXT: v_readlane_b32 s7, v43, 16 -; SI-NEXT: s_or_b32 s70, s20, s19 +; SI-NEXT: v_readlane_b32 s7, v44, 16 +; SI-NEXT: s_mov_b32 s48, s56 +; SI-NEXT: s_mov_b32 s56, s10 +; SI-NEXT: s_or_b32 s69, s20, s19 ; SI-NEXT: s_mov_b32 s10, s7 ; SI-NEXT: s_and_b32 s19, s7, 0xff -; SI-NEXT: v_readlane_b32 s7, v43, 15 +; SI-NEXT: v_readlane_b32 s7, v44, 15 ; SI-NEXT: s_lshl_b32 s19, s19, 16 ; SI-NEXT: s_mov_b32 s71, s7 ; SI-NEXT: s_lshl_b32 s20, s7, 24 -; SI-NEXT: v_readlane_b32 s7, v43, 14 -; SI-NEXT: s_or_b32 s62, s20, s19 -; SI-NEXT: s_mov_b32 s15, s7 -; SI-NEXT: s_and_b32 s19, s7, 0xff -; SI-NEXT: v_readlane_b32 s7, v43, 13 +; SI-NEXT: v_readlane_b32 s7, v44, 14 +; SI-NEXT: s_mov_b32 s39, s75 +; SI-NEXT: s_mov_b32 s75, s94 +; SI-NEXT: s_or_b32 s94, s20, s19 ; SI-NEXT: s_mov_b32 s41, s7 +; SI-NEXT: s_and_b32 s19, s7, 0xff +; SI-NEXT: v_readlane_b32 s7, v44, 13 +; SI-NEXT: s_mov_b32 s14, s7 ; SI-NEXT: s_lshl_b32 s20, s7, 8 -; SI-NEXT: v_readlane_b32 s7, v43, 12 +; SI-NEXT: v_readlane_b32 s7, v44, 12 ; SI-NEXT: 
s_or_b32 s29, s19, s20 -; SI-NEXT: s_mov_b32 s14, s7 +; SI-NEXT: s_mov_b32 s81, s7 ; SI-NEXT: s_and_b32 s19, s7, 0xff -; SI-NEXT: v_readlane_b32 s7, v43, 11 +; SI-NEXT: v_readlane_b32 s7, v44, 11 +; SI-NEXT: s_mov_b32 s55, s45 +; SI-NEXT: s_mov_b32 s45, s9 ; SI-NEXT: s_lshl_b32 s19, s19, 16 ; SI-NEXT: s_mov_b32 s9, s7 ; SI-NEXT: s_lshl_b32 s20, s7, 24 -; SI-NEXT: v_readlane_b32 s7, v43, 10 -; SI-NEXT: s_or_b32 s80, s20, s19 -; SI-NEXT: s_mov_b32 s56, s7 +; SI-NEXT: v_readlane_b32 s7, v44, 10 +; SI-NEXT: s_mov_b32 s38, s11 +; SI-NEXT: s_or_b32 s11, s20, s19 +; SI-NEXT: s_mov_b32 s72, s7 ; SI-NEXT: s_and_b32 s19, s7, 0xff -; SI-NEXT: v_readlane_b32 s7, v43, 9 +; SI-NEXT: v_readlane_b32 s7, v44, 9 ; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_mov_b32 s81, s7 -; SI-NEXT: s_lshl_b32 s20, s7, 24 -; SI-NEXT: v_readlane_b32 s7, v43, 8 -; SI-NEXT: s_or_b32 s11, s20, s19 ; SI-NEXT: s_mov_b32 s82, s7 +; SI-NEXT: s_lshl_b32 s20, s7, 24 +; SI-NEXT: v_readlane_b32 s7, v44, 8 +; SI-NEXT: s_or_b32 s80, s20, s19 +; SI-NEXT: s_mov_b32 s83, s7 ; SI-NEXT: s_and_b32 s19, s7, 0xff -; SI-NEXT: v_readlane_b32 s7, v43, 7 +; SI-NEXT: v_readlane_b32 s7, v44, 7 ; SI-NEXT: s_lshl_b32 s19, s19, 16 ; SI-NEXT: s_mov_b32 s96, s7 ; SI-NEXT: s_lshl_b32 s20, s7, 24 -; SI-NEXT: v_readlane_b32 s7, v43, 6 -; SI-NEXT: s_mov_b32 s36, s63 -; SI-NEXT: s_mov_b32 s63, s93 -; SI-NEXT: s_mov_b32 s93, s61 -; SI-NEXT: s_mov_b32 s61, s91 -; SI-NEXT: s_mov_b32 s91, s75 -; SI-NEXT: s_mov_b32 s75, s92 -; SI-NEXT: s_or_b32 s92, s20, s19 +; SI-NEXT: v_readlane_b32 s7, v44, 6 +; SI-NEXT: s_mov_b32 s90, s31 +; SI-NEXT: s_or_b32 s31, s20, s19 ; SI-NEXT: s_mov_b32 s98, s7 ; SI-NEXT: s_and_b32 s19, s7, 0xff -; SI-NEXT: v_readlane_b32 s7, v43, 5 +; SI-NEXT: v_readlane_b32 s7, v44, 5 ; SI-NEXT: s_mov_b32 s44, s7 ; SI-NEXT: s_lshl_b32 s20, s7, 8 -; SI-NEXT: v_readlane_b32 s7, v43, 4 -; SI-NEXT: s_mov_b32 s48, s13 -; SI-NEXT: s_mov_b32 s13, s94 -; SI-NEXT: s_mov_b32 s94, s21 +; SI-NEXT: v_readlane_b32 s7, v44, 4 +; SI-NEXT: s_mov_b32 s37, s43 +; SI-NEXT: s_mov_b32 s43, s93 +; SI-NEXT: s_mov_b32 s93, s21 ; SI-NEXT: s_or_b32 s21, s19, s20 ; SI-NEXT: s_and_b32 s19, s7, 0xff -; SI-NEXT: s_mov_b32 s95, s4 +; SI-NEXT: s_mov_b32 s34, s4 ; SI-NEXT: s_lshl_b32 s19, s19, 16 ; SI-NEXT: s_lshl_b32 s20, s22, 24 -; SI-NEXT: v_readlane_b32 s4, v42, 58 -; SI-NEXT: s_mov_b32 s46, s45 -; SI-NEXT: s_mov_b32 s34, s73 -; SI-NEXT: s_mov_b32 s73, s12 -; SI-NEXT: s_mov_b32 s37, s42 -; SI-NEXT: s_mov_b32 s38, s59 -; SI-NEXT: s_mov_b32 s59, s8 -; SI-NEXT: s_mov_b32 s30, s88 -; SI-NEXT: s_mov_b32 s88, s31 -; SI-NEXT: s_mov_b32 s78, s40 -; SI-NEXT: s_mov_b32 s31, s43 +; SI-NEXT: v_readlane_b32 s4, v43, 60 +; SI-NEXT: s_mov_b32 s54, s13 +; SI-NEXT: s_mov_b32 s13, s12 +; SI-NEXT: s_mov_b32 s50, s63 +; SI-NEXT: s_mov_b32 s63, s95 +; SI-NEXT: s_mov_b32 s49, s61 +; SI-NEXT: s_mov_b32 s61, s8 +; SI-NEXT: s_mov_b32 s60, s40 ; SI-NEXT: s_mov_b32 s12, s7 ; SI-NEXT: s_mov_b32 s7, s22 -; SI-NEXT: s_or_b32 s83, s20, s19 +; SI-NEXT: s_or_b32 s15, s20, s19 ; SI-NEXT: s_lshl_b32 s20, s4, 16 -; SI-NEXT: s_lshl_b32 s74, s5, 16 +; SI-NEXT: s_lshl_b32 s95, s5, 16 ; SI-NEXT: s_lshl_b32 s22, s6, 16 ; SI-NEXT: s_lshl_b32 s16, s16, 16 ; SI-NEXT: s_lshl_b32 s19, s17, 16 @@ -154682,16 +155500,16 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_lshl_b32 s97, s86, 16 ; SI-NEXT: s_lshl_b32 s28, s87, 16 ; SI-NEXT: s_lshl_b32 s87, s26, 16 -; SI-NEXT: v_readlane_b32 s26, v42, 56 +; SI-NEXT: v_readlane_b32 s26, v43, 58 ; SI-NEXT: s_lshl_b32 s86, s27, 16 -; 
SI-NEXT: v_readlane_b32 s27, v42, 57 -; SI-NEXT: v_readlane_b32 s35, v42, 61 +; SI-NEXT: v_readlane_b32 s27, v43, 59 +; SI-NEXT: v_readlane_b32 s66, v43, 63 ; SI-NEXT: s_lshl_b32 s85, s29, 16 -; SI-NEXT: v_readlane_b32 s29, v42, 60 -; SI-NEXT: v_readlane_b32 s24, v42, 59 -; SI-NEXT: v_readlane_b32 s90, v42, 62 +; SI-NEXT: v_readlane_b32 s29, v43, 62 +; SI-NEXT: v_readlane_b32 s65, v43, 61 +; SI-NEXT: v_readlane_b32 s64, v42, 0 ; SI-NEXT: s_lshl_b32 s84, s21, 16 -; SI-NEXT: s_mov_b32 s21, s47 +; SI-NEXT: v_readlane_b32 s21, v42, 1 ; SI-NEXT: s_cbranch_execnz .LBB89_3 ; SI-NEXT: .LBB89_2: ; %cmp.true ; SI-NEXT: s_add_i32 s4, s98, 3 @@ -154706,10 +155524,10 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_or_b32 s5, s5, s6 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_add_i32 s5, s56, 3 +; SI-NEXT: s_add_i32 s5, s72, 3 ; SI-NEXT: s_and_b32 s5, s5, 0xff -; SI-NEXT: s_lshl_b32 s6, s81, 8 -; SI-NEXT: s_add_i32 s16, s82, 3 +; SI-NEXT: s_lshl_b32 s6, s82, 8 +; SI-NEXT: s_add_i32 s16, s83, 3 ; SI-NEXT: s_or_b32 s5, s6, s5 ; SI-NEXT: s_and_b32 s16, s16, 0xff ; SI-NEXT: s_lshl_b32 s6, s96, 24 @@ -154718,10 +155536,10 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_or_b32 s6, s6, s16 ; SI-NEXT: s_and_b32 s5, s5, 0xffff ; SI-NEXT: s_or_b32 s5, s6, s5 -; SI-NEXT: s_add_i32 s6, s15, 3 +; SI-NEXT: s_add_i32 s6, s41, 3 ; SI-NEXT: s_and_b32 s6, s6, 0xff -; SI-NEXT: s_lshl_b32 s16, s41, 8 -; SI-NEXT: s_add_i32 s17, s14, 3 +; SI-NEXT: s_lshl_b32 s16, s14, 8 +; SI-NEXT: s_add_i32 s17, s81, 3 ; SI-NEXT: s_or_b32 s6, s16, s6 ; SI-NEXT: s_and_b32 s17, s17, 0xff ; SI-NEXT: s_lshl_b32 s16, s9, 24 @@ -154732,7 +155550,7 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_or_b32 s6, s16, s6 ; SI-NEXT: s_add_i32 s16, s93, 3 ; SI-NEXT: s_and_b32 s16, s16, 0xff -; SI-NEXT: s_lshl_b32 s17, s91, 8 +; SI-NEXT: s_lshl_b32 s17, s39, 8 ; SI-NEXT: s_add_i32 s18, s10, 3 ; SI-NEXT: s_or_b32 s16, s17, s16 ; SI-NEXT: s_and_b32 s18, s18, 0xff @@ -154742,150 +155560,143 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_or_b32 s17, s17, s18 ; SI-NEXT: s_and_b32 s16, s16, 0xffff ; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: s_add_i32 s17, s36, 3 +; SI-NEXT: s_add_i32 s17, s50, 3 ; SI-NEXT: s_and_b32 s17, s17, 0xff -; SI-NEXT: s_lshl_b32 s18, s30, 8 -; SI-NEXT: s_add_i32 s19, s78, 3 +; SI-NEXT: s_lshl_b32 s18, s49, 8 +; SI-NEXT: s_add_i32 s19, s60, 3 ; SI-NEXT: s_or_b32 s17, s18, s17 ; SI-NEXT: s_and_b32 s19, s19, 0xff -; SI-NEXT: s_lshl_b32 s18, s31, 24 +; SI-NEXT: s_lshl_b32 s18, s43, 24 ; SI-NEXT: s_lshl_b32 s19, s19, 16 ; SI-NEXT: s_addk_i32 s17, 0x300 ; SI-NEXT: s_or_b32 s18, s18, s19 ; SI-NEXT: s_and_b32 s17, s17, 0xffff ; SI-NEXT: s_or_b32 s17, s18, s17 -; SI-NEXT: s_add_i32 s18, s94, 3 +; SI-NEXT: s_add_i32 s18, s34, 3 ; SI-NEXT: s_and_b32 s18, s18, 0xff -; SI-NEXT: s_lshl_b32 s19, s34, 8 -; SI-NEXT: s_add_i32 s20, s37, 3 +; SI-NEXT: s_lshl_b32 s19, s37, 8 +; SI-NEXT: s_add_i32 s20, s48, 3 ; SI-NEXT: s_or_b32 s18, s19, s18 ; SI-NEXT: s_and_b32 s20, s20, 0xff -; SI-NEXT: s_lshl_b32 s19, s38, 24 +; SI-NEXT: s_lshl_b32 s19, s51, 24 ; SI-NEXT: s_lshl_b32 s20, s20, 16 ; SI-NEXT: s_addk_i32 s18, 0x300 ; SI-NEXT: s_or_b32 s19, s19, s20 ; SI-NEXT: s_and_b32 s18, s18, 0xffff ; SI-NEXT: s_or_b32 s18, s19, s18 -; SI-NEXT: s_add_i32 s19, s95, 3 +; SI-NEXT: s_add_i32 s19, s90, 3 ; SI-NEXT: s_and_b32 s19, s19, 0xff -; 
SI-NEXT: s_lshl_b32 s20, s46, 8 -; SI-NEXT: s_add_i32 s22, s48, 3 +; SI-NEXT: s_lshl_b32 s20, s55, 8 +; SI-NEXT: s_add_i32 s22, s54, 3 ; SI-NEXT: s_or_b32 s19, s20, s19 ; SI-NEXT: s_and_b32 s22, s22, 0xff -; SI-NEXT: s_lshl_b32 s20, s39, 24 +; SI-NEXT: s_lshl_b32 s20, s53, 24 ; SI-NEXT: s_lshl_b32 s22, s22, 16 ; SI-NEXT: s_addk_i32 s19, 0x300 ; SI-NEXT: s_or_b32 s20, s20, s22 ; SI-NEXT: s_and_b32 s19, s19, 0xffff ; SI-NEXT: s_or_b32 s19, s20, s19 -; SI-NEXT: s_add_i32 s20, s58, 3 -; SI-NEXT: v_readlane_b32 s7, v43, 18 +; SI-NEXT: s_add_i32 s20, s91, 3 ; SI-NEXT: s_and_b32 s20, s20, 0xff -; SI-NEXT: s_lshl_b32 s22, s7, 8 -; SI-NEXT: v_readlane_b32 s7, v42, 49 +; SI-NEXT: s_lshl_b32 s22, s35, 8 +; SI-NEXT: s_add_i32 s23, s38, 3 ; SI-NEXT: s_or_b32 s20, s22, s20 -; SI-NEXT: s_lshl_b32 s22, s7, 24 -; SI-NEXT: v_readlane_b32 s7, v42, 48 -; SI-NEXT: s_add_i32 s23, s7, 3 ; SI-NEXT: s_and_b32 s23, s23, 0xff +; SI-NEXT: s_lshl_b32 s22, s52, 24 ; SI-NEXT: s_lshl_b32 s23, s23, 16 ; SI-NEXT: s_addk_i32 s20, 0x300 ; SI-NEXT: s_or_b32 s22, s22, s23 ; SI-NEXT: s_and_b32 s20, s20, 0xffff -; SI-NEXT: v_readlane_b32 s7, v43, 23 ; SI-NEXT: s_or_b32 s20, s22, s20 -; SI-NEXT: s_add_i32 s22, s7, 3 -; SI-NEXT: v_readlane_b32 s7, v43, 22 +; SI-NEXT: s_add_i32 s22, s92, 3 ; SI-NEXT: s_and_b32 s22, s22, 0xff -; SI-NEXT: s_lshl_b32 s23, s7, 8 -; SI-NEXT: v_readlane_b32 s7, v43, 20 +; SI-NEXT: s_lshl_b32 s23, s36, 8 +; SI-NEXT: s_add_i32 s60, s62, 3 ; SI-NEXT: s_or_b32 s22, s23, s22 -; SI-NEXT: s_lshl_b32 s23, s7, 24 -; SI-NEXT: v_readlane_b32 s7, v43, 21 -; SI-NEXT: s_add_i32 s60, s7, 3 ; SI-NEXT: s_and_b32 s60, s60, 0xff +; SI-NEXT: s_lshl_b32 s23, s30, 24 ; SI-NEXT: s_lshl_b32 s60, s60, 16 ; SI-NEXT: s_addk_i32 s22, 0x300 ; SI-NEXT: s_or_b32 s23, s23, s60 ; SI-NEXT: s_and_b32 s22, s22, 0xffff -; SI-NEXT: v_readlane_b32 s7, v43, 27 +; SI-NEXT: v_readlane_b32 s7, v44, 28 ; SI-NEXT: s_or_b32 s22, s23, s22 ; SI-NEXT: s_add_i32 s23, s7, 3 -; SI-NEXT: v_readlane_b32 s7, v43, 26 +; SI-NEXT: v_readlane_b32 s7, v44, 27 ; SI-NEXT: s_and_b32 s23, s23, 0xff ; SI-NEXT: s_lshl_b32 s60, s7, 8 -; SI-NEXT: v_readlane_b32 s7, v43, 24 +; SI-NEXT: v_readlane_b32 s7, v44, 25 ; SI-NEXT: s_or_b32 s23, s60, s23 ; SI-NEXT: s_lshl_b32 s60, s7, 24 -; SI-NEXT: v_readlane_b32 s7, v43, 25 +; SI-NEXT: v_readlane_b32 s7, v44, 26 ; SI-NEXT: s_add_i32 s61, s7, 3 ; SI-NEXT: s_and_b32 s61, s61, 0xff ; SI-NEXT: s_lshl_b32 s61, s61, 16 ; SI-NEXT: s_addk_i32 s23, 0x300 ; SI-NEXT: s_or_b32 s60, s60, s61 ; SI-NEXT: s_and_b32 s23, s23, 0xffff -; SI-NEXT: v_readlane_b32 s7, v43, 31 +; SI-NEXT: v_readlane_b32 s7, v44, 32 ; SI-NEXT: s_or_b32 s23, s60, s23 ; SI-NEXT: s_add_i32 s60, s7, 3 -; SI-NEXT: v_readlane_b32 s7, v43, 30 +; SI-NEXT: v_readlane_b32 s7, v44, 31 ; SI-NEXT: s_and_b32 s60, s60, 0xff ; SI-NEXT: s_lshl_b32 s61, s7, 8 -; SI-NEXT: v_readlane_b32 s7, v43, 28 +; SI-NEXT: v_readlane_b32 s7, v44, 29 ; SI-NEXT: s_or_b32 s60, s61, s60 ; SI-NEXT: s_lshl_b32 s61, s7, 24 -; SI-NEXT: v_readlane_b32 s7, v43, 29 +; SI-NEXT: v_readlane_b32 s7, v44, 30 ; SI-NEXT: s_add_i32 s62, s7, 3 -; SI-NEXT: v_readlane_b32 s7, v42, 47 +; SI-NEXT: v_readlane_b32 s7, v43, 48 ; SI-NEXT: s_and_b32 s62, s62, 0xff ; SI-NEXT: s_add_i32 s59, s7, 3 -; SI-NEXT: v_readlane_b32 s7, v42, 46 +; SI-NEXT: v_readlane_b32 s7, v43, 47 ; SI-NEXT: s_lshl_b32 s62, s62, 16 ; SI-NEXT: s_addk_i32 s60, 0x300 ; SI-NEXT: s_and_b32 s59, s59, 0xff ; SI-NEXT: s_lshl_b32 s58, s7, 8 -; SI-NEXT: v_readlane_b32 s7, v43, 32 +; SI-NEXT: v_readlane_b32 s7, v44, 33 ; SI-NEXT: s_or_b32 s61, s61, s62 ; SI-NEXT: 
s_and_b32 s60, s60, 0xffff ; SI-NEXT: s_or_b32 s58, s58, s59 ; SI-NEXT: s_lshl_b32 s59, s7, 24 -; SI-NEXT: v_readlane_b32 s7, v43, 33 +; SI-NEXT: v_readlane_b32 s7, v44, 44 ; SI-NEXT: s_or_b32 s60, s61, s60 ; SI-NEXT: s_add_i32 s61, s7, 3 -; SI-NEXT: v_readlane_b32 s7, v42, 45 +; SI-NEXT: v_readlane_b32 s7, v43, 46 ; SI-NEXT: s_add_i32 s57, s7, 3 -; SI-NEXT: v_readlane_b32 s7, v42, 44 +; SI-NEXT: v_readlane_b32 s7, v43, 45 ; SI-NEXT: s_lshl_b32 s56, s7, 8 -; SI-NEXT: v_readlane_b32 s7, v42, 43 +; SI-NEXT: v_readlane_b32 s7, v43, 44 ; SI-NEXT: s_lshl_b32 s47, s7, 24 -; SI-NEXT: v_readlane_b32 s7, v42, 42 +; SI-NEXT: v_readlane_b32 s7, v43, 43 ; SI-NEXT: s_add_i32 s46, s7, 3 -; SI-NEXT: v_readlane_b32 s7, v42, 41 +; SI-NEXT: v_readlane_b32 s7, v43, 42 ; SI-NEXT: s_add_i32 s45, s7, 3 -; SI-NEXT: v_readlane_b32 s7, v42, 38 +; SI-NEXT: v_readlane_b32 s7, v43, 39 ; SI-NEXT: s_lshl_b32 s42, s7, 8 -; SI-NEXT: v_readlane_b32 s7, v42, 35 +; SI-NEXT: v_readlane_b32 s7, v43, 36 ; SI-NEXT: s_lshl_b32 s15, s7, 24 -; SI-NEXT: v_readlane_b32 s7, v42, 34 +; SI-NEXT: v_readlane_b32 s7, v43, 35 ; SI-NEXT: s_and_b32 s45, s45, 0xff ; SI-NEXT: s_add_i32 s14, s7, 3 ; SI-NEXT: s_or_b32 s42, s42, s45 ; SI-NEXT: s_and_b32 s14, s14, 0xff ; SI-NEXT: s_lshl_b32 s14, s14, 16 ; SI-NEXT: s_addk_i32 s42, 0x300 -; SI-NEXT: v_readlane_b32 s7, v42, 40 +; SI-NEXT: v_readlane_b32 s7, v43, 41 ; SI-NEXT: s_and_b32 s57, s57, 0xff ; SI-NEXT: s_or_b32 s14, s15, s14 ; SI-NEXT: s_and_b32 s15, s42, 0xffff ; SI-NEXT: s_add_i32 s44, s7, 3 -; SI-NEXT: v_readlane_b32 s7, v42, 39 +; SI-NEXT: v_readlane_b32 s7, v43, 40 ; SI-NEXT: s_or_b32 s56, s56, s57 ; SI-NEXT: s_or_b32 s57, s14, s15 ; SI-NEXT: s_and_b32 s14, s44, 0xff ; SI-NEXT: s_lshl_b32 s15, s7, 8 -; SI-NEXT: v_readlane_b32 s7, v42, 37 +; SI-NEXT: v_readlane_b32 s7, v43, 38 ; SI-NEXT: s_or_b32 s14, s15, s14 ; SI-NEXT: s_lshl_b32 s15, s7, 24 -; SI-NEXT: v_readlane_b32 s7, v42, 36 +; SI-NEXT: v_readlane_b32 s7, v43, 37 ; SI-NEXT: s_add_i32 s40, s7, 3 ; SI-NEXT: s_and_b32 s61, s61, 0xff ; SI-NEXT: s_and_b32 s40, s40, 0xff @@ -154900,15 +155711,15 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_or_b32 s58, s59, s58 ; SI-NEXT: s_or_b32 s59, s15, s14 ; SI-NEXT: s_add_i32 s14, s6, 0x3000000 -; SI-NEXT: v_readlane_b32 s6, v42, 31 +; SI-NEXT: v_readlane_b32 s6, v43, 32 ; SI-NEXT: s_add_i32 s11, s6, 3 -; SI-NEXT: v_readlane_b32 s7, v42, 28 +; SI-NEXT: v_readlane_b32 s7, v43, 29 ; SI-NEXT: s_and_b32 s6, s11, 0xff ; SI-NEXT: s_lshl_b32 s8, s7, 8 -; SI-NEXT: v_readlane_b32 s7, v42, 25 +; SI-NEXT: v_readlane_b32 s7, v43, 26 ; SI-NEXT: s_or_b32 s6, s8, s6 ; SI-NEXT: s_lshl_b32 s8, s7, 24 -; SI-NEXT: v_readlane_b32 s7, v42, 24 +; SI-NEXT: v_readlane_b32 s7, v43, 25 ; SI-NEXT: s_add_i32 s24, s7, 3 ; SI-NEXT: s_and_b32 s11, s24, 0xff ; SI-NEXT: s_addk_i32 s6, 0x300 @@ -154916,47 +155727,47 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_and_b32 s6, s6, 0xffff ; SI-NEXT: s_or_b32 s8, s8, s11 ; SI-NEXT: s_or_b32 s8, s8, s6 -; SI-NEXT: v_readlane_b32 s6, v42, 32 +; SI-NEXT: v_readlane_b32 s6, v43, 33 ; SI-NEXT: s_add_i32 s12, s6, 3 -; SI-NEXT: v_readlane_b32 s7, v42, 26 +; SI-NEXT: v_readlane_b32 s7, v43, 27 ; SI-NEXT: s_and_b32 s6, s12, 0xff ; SI-NEXT: s_lshl_b32 s11, s7, 8 -; SI-NEXT: v_readlane_b32 s7, v42, 16 +; SI-NEXT: v_readlane_b32 s7, v43, 17 ; SI-NEXT: s_or_b32 s6, s11, s6 ; SI-NEXT: s_lshl_b32 s11, s7, 24 -; SI-NEXT: v_readlane_b32 s7, v42, 18 +; SI-NEXT: v_readlane_b32 s7, v43, 19 ; SI-NEXT: 
s_add_i32 s12, s7, 3 ; SI-NEXT: s_and_b32 s12, s12, 0xff ; SI-NEXT: s_addk_i32 s6, 0x300 ; SI-NEXT: s_lshl_b32 s12, s12, 16 -; SI-NEXT: v_readlane_b32 s7, v42, 33 +; SI-NEXT: v_readlane_b32 s7, v43, 34 ; SI-NEXT: s_and_b32 s6, s6, 0xffff ; SI-NEXT: s_or_b32 s11, s11, s12 ; SI-NEXT: s_add_i32 s13, s7, 3 -; SI-NEXT: v_readlane_b32 s7, v42, 30 +; SI-NEXT: v_readlane_b32 s7, v43, 31 ; SI-NEXT: s_or_b32 s6, s11, s6 ; SI-NEXT: s_and_b32 s11, s13, 0xff ; SI-NEXT: s_lshl_b32 s10, s7, 8 -; SI-NEXT: v_readlane_b32 s7, v42, 22 +; SI-NEXT: v_readlane_b32 s7, v43, 23 ; SI-NEXT: s_or_b32 s10, s10, s11 ; SI-NEXT: s_lshl_b32 s11, s7, 24 -; SI-NEXT: v_readlane_b32 s7, v42, 23 +; SI-NEXT: v_readlane_b32 s7, v43, 24 ; SI-NEXT: s_add_i32 s25, s7, 3 ; SI-NEXT: s_and_b32 s12, s25, 0xff ; SI-NEXT: s_addk_i32 s10, 0x300 ; SI-NEXT: s_lshl_b32 s12, s12, 16 ; SI-NEXT: s_and_b32 s10, s10, 0xffff ; SI-NEXT: s_or_b32 s11, s11, s12 -; SI-NEXT: v_readlane_b32 s7, v42, 29 +; SI-NEXT: v_readlane_b32 s7, v43, 30 ; SI-NEXT: s_or_b32 s10, s11, s10 ; SI-NEXT: s_add_i32 s9, s7, 3 -; SI-NEXT: v_readlane_b32 s7, v42, 27 -; SI-NEXT: v_readlane_b32 s11, v42, 20 +; SI-NEXT: v_readlane_b32 s7, v43, 28 +; SI-NEXT: v_readlane_b32 s11, v43, 21 ; SI-NEXT: s_and_b32 s9, s9, 0xff ; SI-NEXT: s_lshl_b32 s7, s7, 8 ; SI-NEXT: s_add_i32 s11, s11, 3 ; SI-NEXT: s_or_b32 s7, s7, s9 -; SI-NEXT: v_readlane_b32 s9, v42, 21 +; SI-NEXT: v_readlane_b32 s9, v43, 22 ; SI-NEXT: s_and_b32 s11, s11, 0xff ; SI-NEXT: s_addk_i32 s7, 0x300 ; SI-NEXT: s_lshl_b32 s9, s9, 24 @@ -154964,15 +155775,15 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_and_b32 s7, s7, 0xffff ; SI-NEXT: s_or_b32 s9, s9, s11 ; SI-NEXT: s_or_b32 s7, s9, s7 -; SI-NEXT: v_readlane_b32 s9, v42, 19 +; SI-NEXT: v_readlane_b32 s9, v43, 20 ; SI-NEXT: s_add_i32 s21, s9, 3 -; SI-NEXT: v_readlane_b32 s11, v42, 17 -; SI-NEXT: v_readlane_b32 s12, v42, 14 +; SI-NEXT: v_readlane_b32 s11, v43, 18 +; SI-NEXT: v_readlane_b32 s12, v43, 15 ; SI-NEXT: s_and_b32 s9, s21, 0xff ; SI-NEXT: s_lshl_b32 s11, s11, 8 ; SI-NEXT: s_add_i32 s12, s12, 3 ; SI-NEXT: s_or_b32 s9, s11, s9 -; SI-NEXT: v_readlane_b32 s11, v42, 15 +; SI-NEXT: v_readlane_b32 s11, v43, 16 ; SI-NEXT: s_and_b32 s12, s12, 0xff ; SI-NEXT: s_addk_i32 s9, 0x300 ; SI-NEXT: s_lshl_b32 s11, s11, 24 @@ -154980,15 +155791,15 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_and_b32 s9, s9, 0xffff ; SI-NEXT: s_or_b32 s11, s11, s12 ; SI-NEXT: s_or_b32 s9, s11, s9 -; SI-NEXT: v_readlane_b32 s11, v42, 13 +; SI-NEXT: v_readlane_b32 s11, v43, 14 ; SI-NEXT: s_add_i32 s11, s11, 3 -; SI-NEXT: v_readlane_b32 s12, v42, 12 -; SI-NEXT: v_readlane_b32 s13, v42, 10 +; SI-NEXT: v_readlane_b32 s12, v43, 13 +; SI-NEXT: v_readlane_b32 s13, v43, 11 ; SI-NEXT: s_and_b32 s11, s11, 0xff ; SI-NEXT: s_lshl_b32 s12, s12, 8 ; SI-NEXT: s_add_i32 s13, s13, 3 ; SI-NEXT: s_or_b32 s11, s12, s11 -; SI-NEXT: v_readlane_b32 s12, v42, 11 +; SI-NEXT: v_readlane_b32 s12, v43, 12 ; SI-NEXT: s_and_b32 s13, s13, 0xff ; SI-NEXT: s_addk_i32 s11, 0x300 ; SI-NEXT: s_lshl_b32 s12, s12, 24 @@ -154996,16 +155807,16 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_and_b32 s11, s11, 0xffff ; SI-NEXT: s_or_b32 s12, s12, s13 ; SI-NEXT: s_or_b32 s11, s12, s11 -; SI-NEXT: v_readlane_b32 s12, v42, 9 +; SI-NEXT: v_readlane_b32 s12, v43, 10 ; SI-NEXT: s_add_i32 s15, s16, 0x3000000 ; SI-NEXT: s_add_i32 s12, s12, 3 -; SI-NEXT: v_readlane_b32 s13, v42, 8 -; SI-NEXT: 
v_readlane_b32 s16, v42, 6 +; SI-NEXT: v_readlane_b32 s13, v43, 9 +; SI-NEXT: v_readlane_b32 s16, v43, 7 ; SI-NEXT: s_and_b32 s12, s12, 0xff ; SI-NEXT: s_lshl_b32 s13, s13, 8 ; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: s_or_b32 s12, s13, s12 -; SI-NEXT: v_readlane_b32 s13, v42, 7 +; SI-NEXT: v_readlane_b32 s13, v43, 8 ; SI-NEXT: s_and_b32 s16, s16, 0xff ; SI-NEXT: s_addk_i32 s12, 0x300 ; SI-NEXT: s_lshl_b32 s13, s13, 24 @@ -155013,16 +155824,16 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_and_b32 s12, s12, 0xffff ; SI-NEXT: s_or_b32 s13, s13, s16 ; SI-NEXT: s_or_b32 s12, s13, s12 -; SI-NEXT: v_readlane_b32 s13, v42, 5 +; SI-NEXT: v_readlane_b32 s13, v43, 6 ; SI-NEXT: s_add_i32 s40, s17, 0x3000000 ; SI-NEXT: s_add_i32 s13, s13, 3 -; SI-NEXT: v_readlane_b32 s16, v42, 4 -; SI-NEXT: v_readlane_b32 s17, v42, 2 +; SI-NEXT: v_readlane_b32 s16, v43, 5 +; SI-NEXT: v_readlane_b32 s17, v43, 3 ; SI-NEXT: s_and_b32 s13, s13, 0xff ; SI-NEXT: s_lshl_b32 s16, s16, 8 ; SI-NEXT: s_add_i32 s17, s17, 3 ; SI-NEXT: s_or_b32 s13, s16, s13 -; SI-NEXT: v_readlane_b32 s16, v42, 3 +; SI-NEXT: v_readlane_b32 s16, v43, 4 ; SI-NEXT: s_and_b32 s17, s17, 0xff ; SI-NEXT: s_addk_i32 s13, 0x300 ; SI-NEXT: s_lshl_b32 s16, s16, 24 @@ -155030,16 +155841,16 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_and_b32 s13, s13, 0xffff ; SI-NEXT: s_or_b32 s16, s16, s17 ; SI-NEXT: s_or_b32 s13, s16, s13 -; SI-NEXT: v_readlane_b32 s16, v42, 1 +; SI-NEXT: v_readlane_b32 s16, v43, 2 ; SI-NEXT: s_add_i32 s41, s18, 0x3000000 ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_readlane_b32 s17, v42, 0 -; SI-NEXT: v_readlane_b32 s18, v43, 62 +; SI-NEXT: v_readlane_b32 s17, v43, 1 +; SI-NEXT: v_readlane_b32 s18, v44, 63 ; SI-NEXT: s_and_b32 s16, s16, 0xff ; SI-NEXT: s_lshl_b32 s17, s17, 8 ; SI-NEXT: s_add_i32 s18, s18, 3 ; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: v_readlane_b32 s17, v43, 63 +; SI-NEXT: v_readlane_b32 s17, v43, 0 ; SI-NEXT: s_and_b32 s18, s18, 0xff ; SI-NEXT: s_addk_i32 s16, 0x300 ; SI-NEXT: s_lshl_b32 s17, s17, 24 @@ -155048,16 +155859,16 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_or_b32 s17, s17, s18 ; SI-NEXT: s_or_b32 s16, s17, s16 ; SI-NEXT: s_add_i32 s17, s16, 0x3000000 -; SI-NEXT: v_readlane_b32 s16, v43, 61 +; SI-NEXT: v_readlane_b32 s16, v44, 62 ; SI-NEXT: s_add_i32 s42, s19, 0x3000000 ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_readlane_b32 s18, v43, 60 -; SI-NEXT: v_readlane_b32 s19, v43, 58 +; SI-NEXT: v_readlane_b32 s18, v44, 61 +; SI-NEXT: v_readlane_b32 s19, v44, 59 ; SI-NEXT: s_and_b32 s16, s16, 0xff ; SI-NEXT: s_lshl_b32 s18, s18, 8 ; SI-NEXT: s_add_i32 s19, s19, 3 ; SI-NEXT: s_or_b32 s16, s18, s16 -; SI-NEXT: v_readlane_b32 s18, v43, 59 +; SI-NEXT: v_readlane_b32 s18, v44, 60 ; SI-NEXT: s_and_b32 s19, s19, 0xff ; SI-NEXT: s_addk_i32 s16, 0x300 ; SI-NEXT: s_lshl_b32 s18, s18, 24 @@ -155065,16 +155876,16 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_and_b32 s16, s16, 0xffff ; SI-NEXT: s_or_b32 s18, s18, s19 ; SI-NEXT: s_or_b32 s16, s18, s16 -; SI-NEXT: v_readlane_b32 s18, v43, 57 +; SI-NEXT: v_readlane_b32 s18, v44, 58 ; SI-NEXT: s_add_i32 s43, s20, 0x3000000 ; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: v_readlane_b32 s19, v43, 56 -; SI-NEXT: v_readlane_b32 s20, v43, 54 +; SI-NEXT: v_readlane_b32 s19, v44, 57 +; SI-NEXT: v_readlane_b32 s20, v44, 55 ; SI-NEXT: s_and_b32 s18, s18, 0xff ; SI-NEXT: s_lshl_b32 
s19, s19, 8 ; SI-NEXT: s_add_i32 s20, s20, 3 ; SI-NEXT: s_or_b32 s18, s19, s18 -; SI-NEXT: v_readlane_b32 s19, v43, 55 +; SI-NEXT: v_readlane_b32 s19, v44, 56 ; SI-NEXT: s_and_b32 s20, s20, 0xff ; SI-NEXT: s_addk_i32 s18, 0x300 ; SI-NEXT: s_lshl_b32 s19, s19, 24 @@ -155082,15 +155893,15 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_and_b32 s18, s18, 0xffff ; SI-NEXT: s_or_b32 s19, s19, s20 ; SI-NEXT: s_or_b32 s18, s19, s18 -; SI-NEXT: v_readlane_b32 s19, v43, 53 +; SI-NEXT: v_readlane_b32 s19, v44, 54 ; SI-NEXT: s_add_i32 s19, s19, 3 -; SI-NEXT: v_readlane_b32 s20, v43, 52 -; SI-NEXT: v_readlane_b32 s21, v43, 50 +; SI-NEXT: v_readlane_b32 s20, v44, 53 +; SI-NEXT: v_readlane_b32 s21, v44, 51 ; SI-NEXT: s_and_b32 s19, s19, 0xff ; SI-NEXT: s_lshl_b32 s20, s20, 8 ; SI-NEXT: s_add_i32 s21, s21, 3 ; SI-NEXT: s_or_b32 s19, s20, s19 -; SI-NEXT: v_readlane_b32 s20, v43, 51 +; SI-NEXT: v_readlane_b32 s20, v44, 52 ; SI-NEXT: s_and_b32 s21, s21, 0xff ; SI-NEXT: s_addk_i32 s19, 0x300 ; SI-NEXT: s_lshl_b32 s20, s20, 24 @@ -155098,16 +155909,16 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_and_b32 s19, s19, 0xffff ; SI-NEXT: s_or_b32 s20, s20, s21 ; SI-NEXT: s_or_b32 s19, s20, s19 -; SI-NEXT: v_readlane_b32 s20, v43, 49 +; SI-NEXT: v_readlane_b32 s20, v44, 50 ; SI-NEXT: s_add_i32 s44, s22, 0x3000000 ; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: v_readlane_b32 s21, v43, 48 -; SI-NEXT: v_readlane_b32 s22, v43, 46 +; SI-NEXT: v_readlane_b32 s21, v44, 49 +; SI-NEXT: v_readlane_b32 s22, v44, 47 ; SI-NEXT: s_and_b32 s20, s20, 0xff ; SI-NEXT: s_lshl_b32 s21, s21, 8 ; SI-NEXT: s_add_i32 s22, s22, 3 ; SI-NEXT: s_or_b32 s20, s21, s20 -; SI-NEXT: v_readlane_b32 s21, v43, 47 +; SI-NEXT: v_readlane_b32 s21, v44, 48 ; SI-NEXT: s_and_b32 s22, s22, 0xff ; SI-NEXT: s_addk_i32 s20, 0x300 ; SI-NEXT: s_lshl_b32 s21, s21, 24 @@ -155116,16 +155927,16 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_or_b32 s21, s21, s22 ; SI-NEXT: s_or_b32 s20, s21, s20 ; SI-NEXT: s_add_i32 s21, s20, 0x3000000 -; SI-NEXT: v_readlane_b32 s20, v43, 43 +; SI-NEXT: v_readlane_b32 s20, v44, 43 ; SI-NEXT: s_add_i32 s45, s23, 0x3000000 ; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: v_readlane_b32 s22, v43, 42 -; SI-NEXT: v_readlane_b32 s23, v43, 44 +; SI-NEXT: v_readlane_b32 s22, v44, 42 +; SI-NEXT: v_readlane_b32 s23, v44, 45 ; SI-NEXT: s_and_b32 s20, s20, 0xff ; SI-NEXT: s_lshl_b32 s22, s22, 8 ; SI-NEXT: s_add_i32 s23, s23, 3 ; SI-NEXT: s_or_b32 s20, s22, s20 -; SI-NEXT: v_readlane_b32 s22, v43, 45 +; SI-NEXT: v_readlane_b32 s22, v44, 46 ; SI-NEXT: s_and_b32 s23, s23, 0xff ; SI-NEXT: s_addk_i32 s20, 0x300 ; SI-NEXT: s_lshl_b32 s22, s22, 24 @@ -155134,15 +155945,15 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_or_b32 s22, s22, s23 ; SI-NEXT: s_or_b32 s20, s22, s20 ; SI-NEXT: s_add_i32 s22, s20, 0x3000000 -; SI-NEXT: v_readlane_b32 s20, v43, 41 +; SI-NEXT: v_readlane_b32 s20, v44, 41 ; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: v_readlane_b32 s23, v43, 40 -; SI-NEXT: v_readlane_b32 s24, v43, 38 +; SI-NEXT: v_readlane_b32 s23, v44, 40 +; SI-NEXT: v_readlane_b32 s24, v44, 38 ; SI-NEXT: s_and_b32 s20, s20, 0xff ; SI-NEXT: s_lshl_b32 s23, s23, 8 ; SI-NEXT: s_add_i32 s24, s24, 3 ; SI-NEXT: s_or_b32 s20, s23, s20 -; SI-NEXT: v_readlane_b32 s23, v43, 39 +; SI-NEXT: v_readlane_b32 s23, v44, 39 ; SI-NEXT: s_and_b32 s24, s24, 0xff ; SI-NEXT: s_addk_i32 s20, 0x300 ; 
SI-NEXT: s_lshl_b32 s23, s23, 24 @@ -155151,361 +155962,367 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_or_b32 s23, s23, s24 ; SI-NEXT: s_or_b32 s20, s23, s20 ; SI-NEXT: s_add_i32 s23, s20, 0x3000000 -; SI-NEXT: v_readlane_b32 s20, v43, 37 +; SI-NEXT: v_readlane_b32 s20, v44, 37 ; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: v_readlane_b32 s24, v43, 36 -; SI-NEXT: v_readlane_b32 s25, v43, 34 +; SI-NEXT: v_readlane_b32 s24, v44, 36 +; SI-NEXT: v_readlane_b32 s25, v44, 34 ; SI-NEXT: s_and_b32 s20, s20, 0xff ; SI-NEXT: s_lshl_b32 s24, s24, 8 ; SI-NEXT: s_add_i32 s25, s25, 3 ; SI-NEXT: s_or_b32 s20, s24, s20 -; SI-NEXT: v_readlane_b32 s24, v43, 35 +; SI-NEXT: v_readlane_b32 s24, v44, 35 ; SI-NEXT: s_and_b32 s25, s25, 0xff ; SI-NEXT: s_addk_i32 s20, 0x300 ; SI-NEXT: s_lshl_b32 s24, s24, 24 ; SI-NEXT: s_lshl_b32 s25, s25, 16 ; SI-NEXT: s_and_b32 s20, s20, 0xffff ; SI-NEXT: s_or_b32 s24, s24, s25 -; SI-NEXT: s_and_b32 s46, s46, 0xff ; SI-NEXT: s_or_b32 s20, s24, s20 -; SI-NEXT: v_readlane_b32 s24, v43, 3 -; SI-NEXT: s_lshl_b32 s46, s46, 16 -; SI-NEXT: s_addk_i32 s56, 0x300 +; SI-NEXT: v_readlane_b32 s24, v44, 3 ; SI-NEXT: s_add_i32 s24, s24, 3 -; SI-NEXT: v_readlane_b32 s25, v43, 2 -; SI-NEXT: v_readlane_b32 s26, v43, 1 -; SI-NEXT: s_or_b32 s46, s47, s46 -; SI-NEXT: s_and_b32 s47, s56, 0xffff -; SI-NEXT: s_add_i32 s7, s7, 0x3000000 -; SI-NEXT: s_add_i32 s9, s9, 0x3000000 +; SI-NEXT: v_readlane_b32 s25, v44, 2 +; SI-NEXT: v_readlane_b32 s26, v44, 1 ; SI-NEXT: s_and_b32 s24, s24, 0xff ; SI-NEXT: s_lshl_b32 s25, s25, 8 ; SI-NEXT: s_add_i32 s26, s26, 3 -; SI-NEXT: s_or_b32 s56, s46, s47 -; SI-NEXT: s_add_i32 s47, s58, 0x3000000 -; SI-NEXT: s_add_i32 s58, s59, 0x3000000 -; SI-NEXT: s_add_i32 s10, s10, 0x3000000 ; SI-NEXT: s_or_b32 s24, s25, s24 -; SI-NEXT: v_readlane_b32 s25, v43, 0 +; SI-NEXT: v_readlane_b32 s25, v44, 0 ; SI-NEXT: s_and_b32 s26, s26, 0xff -; SI-NEXT: s_and_b32 s73, s9, 0xffff0000 -; SI-NEXT: s_lshl_b32 s59, s9, 16 -; SI-NEXT: s_and_b32 s9, s7, 0xffff0000 -; SI-NEXT: s_add_i32 s6, s6, 0x3000000 +; SI-NEXT: s_add_i32 s13, s13, 0x3000000 ; SI-NEXT: s_addk_i32 s24, 0x300 ; SI-NEXT: s_lshl_b32 s25, s25, 24 ; SI-NEXT: s_lshl_b32 s26, s26, 16 -; SI-NEXT: s_and_b32 s63, s17, 0xffff0000 -; SI-NEXT: s_lshl_b32 s79, s17, 16 -; SI-NEXT: v_writelane_b32 v42, s9, 50 -; SI-NEXT: s_lshl_b32 s17, s7, 16 -; SI-NEXT: s_lshl_b32 s7, s10, 16 -; SI-NEXT: s_add_i32 s8, s8, 0x3000000 +; SI-NEXT: s_add_i32 s9, s9, 0x3000000 +; SI-NEXT: s_add_i32 s11, s11, 0x3000000 +; SI-NEXT: s_add_i32 s18, s18, 0x3000000 ; SI-NEXT: s_and_b32 s24, s24, 0xffff ; SI-NEXT: s_or_b32 s25, s25, s26 -; SI-NEXT: v_writelane_b32 v42, s7, 51 -; SI-NEXT: s_and_b32 s7, s6, 0xffff0000 +; SI-NEXT: s_and_b32 s89, s17, 0xffff0000 +; SI-NEXT: s_lshl_b32 s88, s17, 16 +; SI-NEXT: s_and_b32 s17, s13, 0xffff0000 +; SI-NEXT: s_add_i32 s7, s7, 0x3000000 ; SI-NEXT: s_or_b32 s24, s25, s24 -; SI-NEXT: v_writelane_b32 v42, s7, 52 +; SI-NEXT: s_and_b32 s74, s18, 0xffff0000 +; SI-NEXT: s_lshl_b32 s25, s18, 16 +; SI-NEXT: v_writelane_b32 v43, s17, 49 +; SI-NEXT: s_and_b32 s63, s11, 0xffff0000 +; SI-NEXT: s_lshl_b32 s18, s11, 16 +; SI-NEXT: s_and_b32 s11, s9, 0xffff0000 +; SI-NEXT: s_and_b32 s46, s46, 0xff +; SI-NEXT: s_add_i32 s6, s6, 0x3000000 +; SI-NEXT: v_writelane_b32 v43, s11, 50 +; SI-NEXT: s_lshl_b32 s61, s9, 16 +; SI-NEXT: s_and_b32 s9, s7, 0xffff0000 +; SI-NEXT: s_lshl_b32 s46, s46, 16 +; SI-NEXT: s_addk_i32 s56, 0x300 +; SI-NEXT: s_add_i32 s8, s8, 0x3000000 +; SI-NEXT: v_writelane_b32 v43, s9, 
51 +; SI-NEXT: s_lshl_b32 s17, s7, 16 +; SI-NEXT: s_and_b32 s7, s6, 0xffff0000 +; SI-NEXT: s_or_b32 s46, s47, s46 +; SI-NEXT: s_and_b32 s47, s56, 0xffff +; SI-NEXT: v_writelane_b32 v43, s7, 52 ; SI-NEXT: s_and_b32 s7, s8, 0xffff0000 +; SI-NEXT: s_or_b32 s56, s46, s47 +; SI-NEXT: s_add_i32 s47, s58, 0x3000000 +; SI-NEXT: s_add_i32 s58, s59, 0x3000000 +; SI-NEXT: v_writelane_b32 v43, s7, 53 +; SI-NEXT: s_lshl_b32 s7, s8, 16 +; SI-NEXT: s_add_i32 s57, s57, 0x3000000 +; SI-NEXT: v_writelane_b32 v43, s7, 54 +; SI-NEXT: s_and_b32 s7, s58, 0xffff0000 ; SI-NEXT: s_add_i32 s4, s4, 0x3000000 ; SI-NEXT: s_add_i32 s5, s5, 0x3000000 ; SI-NEXT: s_add_i32 s46, s60, 0x3000000 ; SI-NEXT: s_add_i32 s56, s56, 0x3000000 -; SI-NEXT: s_add_i32 s57, s57, 0x3000000 -; SI-NEXT: s_add_i32 s11, s11, 0x3000000 +; SI-NEXT: s_add_i32 s10, s10, 0x3000000 ; SI-NEXT: s_add_i32 s12, s12, 0x3000000 -; SI-NEXT: s_add_i32 s13, s13, 0x3000000 ; SI-NEXT: s_add_i32 s16, s16, 0x3000000 -; SI-NEXT: s_add_i32 s18, s18, 0x3000000 ; SI-NEXT: s_add_i32 s19, s19, 0x3000000 ; SI-NEXT: s_add_i32 s20, s20, 0x3000000 ; SI-NEXT: s_add_i32 s24, s24, 0x3000000 -; SI-NEXT: v_writelane_b32 v42, s7, 53 -; SI-NEXT: s_lshl_b32 s7, s8, 16 +; SI-NEXT: v_writelane_b32 v43, s7, 55 +; SI-NEXT: s_and_b32 s7, s57, 0xffff0000 ; SI-NEXT: s_and_b32 s27, s24, 0xffff0000 ; SI-NEXT: s_lshl_b32 s26, s24, 16 -; SI-NEXT: s_and_b32 s24, s20, 0xffff0000 +; SI-NEXT: s_and_b32 s65, s20, 0xffff0000 ; SI-NEXT: s_lshl_b32 s20, s20, 16 -; SI-NEXT: s_and_b32 s35, s23, 0xffff0000 +; SI-NEXT: s_and_b32 s66, s23, 0xffff0000 ; SI-NEXT: s_lshl_b32 s29, s23, 16 -; SI-NEXT: s_and_b32 s90, s22, 0xffff0000 -; SI-NEXT: s_lshl_b32 s74, s22, 16 -; SI-NEXT: s_and_b32 s25, s21, 0xffff0000 +; SI-NEXT: s_and_b32 s64, s22, 0xffff0000 +; SI-NEXT: s_lshl_b32 s95, s22, 16 +; SI-NEXT: s_and_b32 s76, s21, 0xffff0000 ; SI-NEXT: s_lshl_b32 s21, s21, 16 -; SI-NEXT: s_and_b32 s75, s19, 0xffff0000 +; SI-NEXT: s_and_b32 s77, s19, 0xffff0000 ; SI-NEXT: s_lshl_b32 s22, s19, 16 -; SI-NEXT: s_and_b32 s61, s18, 0xffff0000 -; SI-NEXT: s_lshl_b32 s76, s18, 16 -; SI-NEXT: s_and_b32 s77, s16, 0xffff0000 +; SI-NEXT: s_and_b32 s78, s16, 0xffff0000 ; SI-NEXT: s_lshl_b32 s16, s16, 16 -; SI-NEXT: s_and_b32 s89, s13, 0xffff0000 ; SI-NEXT: s_lshl_b32 s19, s13, 16 -; SI-NEXT: s_and_b32 s13, s12, 0xffff0000 -; SI-NEXT: s_lshl_b32 s88, s12, 16 -; SI-NEXT: s_and_b32 s60, s11, 0xffff0000 -; SI-NEXT: s_lshl_b32 s18, s11, 16 -; SI-NEXT: s_and_b32 s23, s10, 0xffff0000 +; SI-NEXT: s_and_b32 s75, s12, 0xffff0000 +; SI-NEXT: s_lshl_b32 s79, s12, 16 +; SI-NEXT: s_and_b32 s13, s10, 0xffff0000 +; SI-NEXT: s_lshl_b32 s59, s10, 16 ; SI-NEXT: s_lshl_b32 s6, s6, 16 -; SI-NEXT: v_writelane_b32 v42, s7, 54 -; SI-NEXT: s_and_b32 s72, s58, 0xffff0000 ; SI-NEXT: s_lshl_b32 s99, s58, 16 -; SI-NEXT: s_and_b32 s7, s57, 0xffff0000 +; SI-NEXT: v_writelane_b32 v43, s7, 56 ; SI-NEXT: s_lshl_b32 s57, s57, 16 -; SI-NEXT: s_and_b32 s49, s56, 0xffff0000 +; SI-NEXT: s_and_b32 s7, s56, 0xffff0000 ; SI-NEXT: s_lshl_b32 s8, s56, 16 -; SI-NEXT: s_and_b32 s51, s47, 0xffff0000 -; SI-NEXT: s_lshl_b32 s50, s47, 16 -; SI-NEXT: s_and_b32 s52, s46, 0xffff0000 +; SI-NEXT: s_and_b32 s56, s47, 0xffff0000 +; SI-NEXT: s_lshl_b32 s23, s47, 16 +; SI-NEXT: s_and_b32 s47, s46, 0xffff0000 ; SI-NEXT: s_lshl_b32 s97, s46, 16 -; SI-NEXT: s_and_b32 s54, s45, 0xffff0000 -; SI-NEXT: s_lshl_b32 s53, s45, 16 -; SI-NEXT: s_and_b32 s55, s44, 0xffff0000 +; SI-NEXT: s_and_b32 s24, s45, 0xffff0000 +; SI-NEXT: s_lshl_b32 s45, s45, 16 +; SI-NEXT: s_and_b32 s58, s44, 0xffff0000 ; 
SI-NEXT: s_lshl_b32 s28, s44, 16 -; SI-NEXT: s_and_b32 s65, s43, 0xffff0000 -; SI-NEXT: s_lshl_b32 s64, s43, 16 -; SI-NEXT: s_and_b32 s66, s42, 0xffff0000 +; SI-NEXT: s_and_b32 s73, s43, 0xffff0000 +; SI-NEXT: s_lshl_b32 s46, s43, 16 +; SI-NEXT: s_and_b32 s67, s42, 0xffff0000 ; SI-NEXT: s_lshl_b32 s87, s42, 16 ; SI-NEXT: s_and_b32 s68, s41, 0xffff0000 -; SI-NEXT: s_lshl_b32 s67, s41, 16 -; SI-NEXT: s_and_b32 s69, s40, 0xffff0000 +; SI-NEXT: s_lshl_b32 s42, s41, 16 +; SI-NEXT: s_and_b32 s70, s40, 0xffff0000 ; SI-NEXT: s_lshl_b32 s86, s40, 16 -; SI-NEXT: s_and_b32 s62, s15, 0xffff0000 -; SI-NEXT: s_lshl_b32 s70, s15, 16 -; SI-NEXT: s_and_b32 s80, s14, 0xffff0000 +; SI-NEXT: s_and_b32 s94, s15, 0xffff0000 +; SI-NEXT: s_lshl_b32 s69, s15, 16 +; SI-NEXT: s_and_b32 s11, s14, 0xffff0000 ; SI-NEXT: s_lshl_b32 s85, s14, 16 -; SI-NEXT: s_and_b32 s92, s5, 0xffff0000 -; SI-NEXT: s_lshl_b32 s11, s5, 16 -; SI-NEXT: s_and_b32 s83, s4, 0xffff0000 +; SI-NEXT: s_and_b32 s31, s5, 0xffff0000 +; SI-NEXT: s_lshl_b32 s80, s5, 16 +; SI-NEXT: s_and_b32 s15, s4, 0xffff0000 ; SI-NEXT: s_lshl_b32 s84, s4, 16 -; SI-NEXT: v_writelane_b32 v42, s7, 55 +; SI-NEXT: v_writelane_b32 v43, s7, 57 ; SI-NEXT: .LBB89_3: ; %end ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s27 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s26 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s26 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: v_readlane_b32 s4, v43, 49 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s24 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s20 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s65 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s20 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s35 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s29 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s66 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s29 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s90 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s74 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s64 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s95 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s25 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s21 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s76 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s21 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s75 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s22 -; SI-NEXT: v_alignbit_b32 v1, 
v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s77 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s22 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s61 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s76 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s74 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s25 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s77 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s16 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s78 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s16 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s63 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s79 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s89 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s88 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s89 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s19 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s4 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s19 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s13 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s88 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s75 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s79 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s60 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s18 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s63 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s18 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 +; SI-NEXT: v_readlane_b32 s4, v43, 50 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s73 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s59 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s4 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s61 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 -; SI-NEXT: v_readlane_b32 s4, v42, 50 +; SI-NEXT: v_readlane_b32 s4, v43, 51 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; 
SI-NEXT: v_mul_f32_e64 v1, 1.0, s4 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s17 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s17 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s23 -; SI-NEXT: v_readlane_b32 s4, v42, 51 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s4 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s13 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s59 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 -; SI-NEXT: v_readlane_b32 s4, v42, 52 +; SI-NEXT: v_readlane_b32 s4, v43, 52 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s4 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s6 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s6 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 -; SI-NEXT: v_readlane_b32 s4, v42, 53 +; SI-NEXT: v_readlane_b32 s4, v43, 53 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s4 -; SI-NEXT: v_readlane_b32 s4, v42, 54 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s4 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_readlane_b32 s4, v43, 54 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s4 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 +; SI-NEXT: v_readlane_b32 s4, v43, 55 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s72 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s99 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s4 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s99 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 -; SI-NEXT: v_readlane_b32 s4, v42, 55 +; SI-NEXT: v_readlane_b32 s4, v43, 56 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s4 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s57 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s57 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 +; SI-NEXT: v_readlane_b32 s4, v43, 57 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s49 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s8 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s4 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s8 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s51 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s50 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 
16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s56 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s23 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s52 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s97 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s47 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s97 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s54 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s53 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s24 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s45 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s55 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s28 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s58 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s28 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s65 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s64 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s73 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s46 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s66 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s87 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s67 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s87 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s68 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s67 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s42 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s69 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s86 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s70 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s86 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s62 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s70 -; SI-NEXT: v_alignbit_b32 v1, 
v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s94 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s69 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s80 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s85 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s11 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s85 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s92 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s11 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s31 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s80 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s83 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s84 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s15 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s84 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload @@ -155549,6 +156366,7 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[4:5] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -155559,99 +156377,109 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: ; implicit-def: $sgpr8 ; SI-NEXT: ; implicit-def: $sgpr6 ; SI-NEXT: ; kill: killed $sgpr8 -; SI-NEXT: v_readlane_b32 s58, v43, 19 +; SI-NEXT: ; kill: killed $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr6 ; SI-NEXT: ; implicit-def: $sgpr8 -; SI-NEXT: s_mov_b32 s95, s47 -; SI-NEXT: s_mov_b32 s94, s21 -; SI-NEXT: s_mov_b32 s93, s61 -; SI-NEXT: s_mov_b32 s34, s73 -; SI-NEXT: s_mov_b32 s91, s75 -; SI-NEXT: v_readlane_b32 s56, v43, 10 -; SI-NEXT: s_mov_b32 s36, s63 -; SI-NEXT: s_mov_b32 s38, s59 -; SI-NEXT: s_mov_b32 s37, s42 -; SI-NEXT: v_readlane_b32 s30, v43, 17 -; SI-NEXT: v_readlane_b32 s98, v43, 6 -; SI-NEXT: s_mov_b32 s46, s45 -; SI-NEXT: s_mov_b32 s31, s43 -; SI-NEXT: s_mov_b32 s78, s40 -; SI-NEXT: v_readlane_b32 s15, v43, 14 -; SI-NEXT: s_mov_b32 s39, s57 -; SI-NEXT: s_mov_b32 s48, s13 -; SI-NEXT: v_readlane_b32 s41, v43, 13 -; SI-NEXT: v_readlane_b32 s44, v43, 5 -; SI-NEXT: v_readlane_b32 s9, v43, 11 -; SI-NEXT: v_readlane_b32 s14, v43, 12 -; SI-NEXT: v_readlane_b32 s81, v43, 9 -; SI-NEXT: v_readlane_b32 s10, v43, 16 -; SI-NEXT: v_readlane_b32 s12, v43, 4 -; SI-NEXT: v_readlane_b32 s96, v43, 7 -; SI-NEXT: v_readlane_b32 s82, v43, 8 -; SI-NEXT: v_readlane_b32 
s71, v43, 15 +; SI-NEXT: ; kill: killed $sgpr8 +; SI-NEXT: v_readlane_b32 s92, v44, 24 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: v_readlane_b32 s91, v44, 20 +; SI-NEXT: s_mov_b32 s90, s88 +; SI-NEXT: v_readlane_b32 s36, v44, 23 +; SI-NEXT: v_readlane_b32 s35, v44, 19 +; SI-NEXT: v_readlane_b32 s62, v44, 22 +; SI-NEXT: v_readlane_b32 s38, v44, 18 +; SI-NEXT: s_mov_b32 s34, s46 +; SI-NEXT: s_mov_b32 s93, s21 +; SI-NEXT: s_mov_b32 s37, s43 +; SI-NEXT: s_mov_b32 s39, s75 +; SI-NEXT: v_readlane_b32 s72, v44, 10 +; SI-NEXT: s_mov_b32 s50, s63 +; SI-NEXT: s_mov_b32 s51, s59 +; SI-NEXT: s_mov_b32 s48, s56 +; SI-NEXT: v_readlane_b32 s30, v44, 21 +; SI-NEXT: s_mov_b32 s49, s61 +; SI-NEXT: s_mov_b32 s52, s79 +; SI-NEXT: v_readlane_b32 s98, v44, 6 +; SI-NEXT: s_mov_b32 s55, s45 +; SI-NEXT: v_readlane_b32 s43, v44, 17 +; SI-NEXT: s_mov_b32 s60, s40 +; SI-NEXT: v_readlane_b32 s41, v44, 14 +; SI-NEXT: s_mov_b32 s53, s42 +; SI-NEXT: s_mov_b32 s54, s13 +; SI-NEXT: v_readlane_b32 s14, v44, 13 +; SI-NEXT: v_readlane_b32 s44, v44, 5 +; SI-NEXT: v_readlane_b32 s9, v44, 11 +; SI-NEXT: v_readlane_b32 s81, v44, 12 +; SI-NEXT: v_readlane_b32 s82, v44, 9 +; SI-NEXT: v_readlane_b32 s10, v44, 16 +; SI-NEXT: v_readlane_b32 s12, v44, 4 +; SI-NEXT: v_readlane_b32 s96, v44, 7 +; SI-NEXT: v_readlane_b32 s83, v44, 8 +; SI-NEXT: v_readlane_b32 s71, v44, 15 ; SI-NEXT: ; kill: killed $sgpr6 ; SI-NEXT: ; implicit-def: $sgpr6 ; SI-NEXT: ; kill: killed $sgpr8 ; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr11 ; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr27 ; SI-NEXT: ; implicit-def: $sgpr20 -; SI-NEXT: ; implicit-def: $sgpr24 +; SI-NEXT: ; implicit-def: $sgpr65 ; SI-NEXT: ; implicit-def: $sgpr29 -; SI-NEXT: ; implicit-def: $sgpr35 -; SI-NEXT: ; implicit-def: $sgpr74 -; SI-NEXT: ; implicit-def: $sgpr90 +; SI-NEXT: ; implicit-def: $sgpr66 +; SI-NEXT: ; implicit-def: $sgpr95 +; SI-NEXT: ; implicit-def: $sgpr64 ; SI-NEXT: ; implicit-def: $sgpr21 -; SI-NEXT: ; implicit-def: $sgpr25 -; SI-NEXT: ; implicit-def: $sgpr22 -; SI-NEXT: ; implicit-def: $sgpr75 ; SI-NEXT: ; implicit-def: $sgpr76 -; SI-NEXT: ; implicit-def: $sgpr61 -; SI-NEXT: ; implicit-def: $sgpr16 +; SI-NEXT: ; implicit-def: $sgpr22 ; SI-NEXT: ; implicit-def: $sgpr77 -; SI-NEXT: ; implicit-def: $sgpr79 -; SI-NEXT: ; implicit-def: $sgpr63 -; SI-NEXT: ; implicit-def: $sgpr19 -; SI-NEXT: ; implicit-def: $sgpr89 +; SI-NEXT: ; implicit-def: $sgpr25 +; SI-NEXT: ; implicit-def: $sgpr74 +; SI-NEXT: ; implicit-def: $sgpr16 +; SI-NEXT: ; implicit-def: $sgpr78 ; SI-NEXT: ; implicit-def: $sgpr88 -; SI-NEXT: ; implicit-def: $sgpr13 +; SI-NEXT: ; implicit-def: $sgpr89 +; SI-NEXT: ; implicit-def: $sgpr19 +; SI-NEXT: ; implicit-def: $sgpr79 +; SI-NEXT: ; implicit-def: $sgpr75 ; SI-NEXT: ; implicit-def: $sgpr18 -; SI-NEXT: ; implicit-def: $sgpr60 -; SI-NEXT: ; implicit-def: $sgpr59 -; SI-NEXT: ; implicit-def: $sgpr73 +; SI-NEXT: ; implicit-def: $sgpr63 +; SI-NEXT: ; implicit-def: $sgpr61 ; SI-NEXT: ; implicit-def: $sgpr17 ; SI-NEXT: ; kill: killed $sgpr6 -; SI-NEXT: ; implicit-def: $sgpr23 +; SI-NEXT: ; implicit-def: $sgpr59 +; SI-NEXT: ; implicit-def: $sgpr13 ; SI-NEXT: ; implicit-def: $sgpr6 ; SI-NEXT: ; implicit-def: $sgpr99 -; SI-NEXT: ; implicit-def: $sgpr72 ; SI-NEXT: ; implicit-def: $sgpr57 ; SI-NEXT: ; kill: killed $sgpr8 ; SI-NEXT: ; implicit-def: $sgpr8 -; SI-NEXT: ; implicit-def: $sgpr49 -; SI-NEXT: ; implicit-def: $sgpr50 -; SI-NEXT: ; implicit-def: $sgpr51 +; SI-NEXT: ; kill: killed $sgpr11 +; SI-NEXT: ; implicit-def: $sgpr23 +; 
SI-NEXT: ; implicit-def: $sgpr56 ; SI-NEXT: ; implicit-def: $sgpr97 -; SI-NEXT: ; implicit-def: $sgpr52 -; SI-NEXT: ; implicit-def: $sgpr53 -; SI-NEXT: ; implicit-def: $sgpr54 +; SI-NEXT: ; implicit-def: $sgpr47 +; SI-NEXT: ; implicit-def: $sgpr45 +; SI-NEXT: ; implicit-def: $sgpr24 ; SI-NEXT: ; implicit-def: $sgpr28 -; SI-NEXT: ; implicit-def: $sgpr55 -; SI-NEXT: ; implicit-def: $sgpr64 -; SI-NEXT: ; implicit-def: $sgpr65 +; SI-NEXT: ; implicit-def: $sgpr58 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr73 ; SI-NEXT: ; implicit-def: $sgpr87 -; SI-NEXT: ; implicit-def: $sgpr66 ; SI-NEXT: ; implicit-def: $sgpr67 +; SI-NEXT: ; implicit-def: $sgpr42 ; SI-NEXT: ; implicit-def: $sgpr68 ; SI-NEXT: ; implicit-def: $sgpr86 -; SI-NEXT: ; implicit-def: $sgpr69 ; SI-NEXT: ; implicit-def: $sgpr70 -; SI-NEXT: ; implicit-def: $sgpr62 +; SI-NEXT: ; implicit-def: $sgpr69 +; SI-NEXT: ; implicit-def: $sgpr94 ; SI-NEXT: ; implicit-def: $sgpr85 -; SI-NEXT: ; implicit-def: $sgpr80 ; SI-NEXT: ; implicit-def: $sgpr11 -; SI-NEXT: ; implicit-def: $sgpr92 +; SI-NEXT: ; implicit-def: $sgpr80 +; SI-NEXT: ; implicit-def: $sgpr31 ; SI-NEXT: ; implicit-def: $sgpr84 -; SI-NEXT: ; implicit-def: $sgpr83 +; SI-NEXT: ; implicit-def: $sgpr15 ; SI-NEXT: s_branch .LBB89_2 ; ; VI-LABEL: bitcast_v128i8_to_v64bf16_scalar: @@ -166896,40 +167724,43 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_or_saveexec_b64 s[4:5], -1 -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[4:5] -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword 
v43, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:80 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:16 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:28 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:24 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:44 ; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:40 ; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:52 ; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:68 ; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:64 ; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:76 @@ -166962,1627 +167793,1841 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; SI-NEXT: v_writelane_b32 v63, s81, 25 ; SI-NEXT: v_writelane_b32 v63, s82, 26 ; SI-NEXT: v_writelane_b32 v63, s83, 27 -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_mul_f32_e32 v56, 1.0, v2 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v10 ; SI-NEXT: v_writelane_b32 v63, s84, 28 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v9 ; SI-NEXT: v_writelane_b32 v63, s85, 29 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, 
v14 ; SI-NEXT: v_writelane_b32 v63, s86, 30 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v15 ; SI-NEXT: v_writelane_b32 v63, s87, 31 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v18 +; SI-NEXT: v_mul_f32_e32 v39, 1.0, v10 ; SI-NEXT: v_writelane_b32 v63, s96, 32 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v26 ; SI-NEXT: v_writelane_b32 v63, s97, 33 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill ; SI-NEXT: v_writelane_b32 v63, s98, 34 -; SI-NEXT: v_mov_b32_e32 v46, v21 ; SI-NEXT: v_writelane_b32 v63, s99, 35 -; SI-NEXT: v_mul_f32_e32 v47, 1.0, v1 -; SI-NEXT: v_mul_f32_e32 v32, 1.0, v4 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v3 -; SI-NEXT: v_mul_f32_e32 v58, 1.0, v6 -; SI-NEXT: v_mul_f32_e32 v41, 1.0, v5 -; SI-NEXT: v_mul_f32_e32 v59, 1.0, v8 -; SI-NEXT: v_mul_f32_e32 v61, 1.0, v7 -; SI-NEXT: v_mul_f32_e32 v5, 1.0, v12 -; SI-NEXT: v_mul_f32_e32 v60, 1.0, v11 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; SI-NEXT: v_mul_f32_e32 v13, 1.0, v13 -; SI-NEXT: v_mul_f32_e32 v21, 1.0, v16 -; SI-NEXT: v_mul_f32_e32 v17, 1.0, v17 -; SI-NEXT: v_mul_f32_e32 v9, 1.0, v20 -; SI-NEXT: v_mul_f32_e32 v12, 1.0, v19 -; SI-NEXT: v_mul_f32_e32 v22, 1.0, v22 -; SI-NEXT: v_mul_f32_e32 v20, 1.0, v46 +; SI-NEXT: v_mul_f32_e32 v35, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v51, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v47, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v46, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v8 +; SI-NEXT: v_mul_f32_e32 v5, 1.0, v7 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v48 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill -; SI-NEXT: v_mul_f32_e32 v24, 1.0, v24 -; SI-NEXT: v_mul_f32_e32 v46, 1.0, v23 +; SI-NEXT: v_mul_f32_e32 v60, 1.0, v9 +; SI-NEXT: v_mul_f32_e32 v9, 1.0, v12 +; SI-NEXT: v_mul_f32_e32 v31, 1.0, v11 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v14 +; SI-NEXT: v_mul_f32_e32 v7, 1.0, v13 +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v15 +; SI-NEXT: v_mul_f32_e32 v55, 1.0, v18 +; SI-NEXT: v_mul_f32_e32 v18, 1.0, v17 +; SI-NEXT: v_mul_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_mul_f32_e32 v12, 1.0, v19 +; SI-NEXT: v_mul_f32_e32 v8, 1.0, v22 +; SI-NEXT: v_mul_f32_e32 v19, 1.0, v21 +; SI-NEXT: v_mul_f32_e32 v21, 1.0, v24 +; SI-NEXT: v_mul_f32_e32 v22, 1.0, v23 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32 +; SI-NEXT: v_mul_f32_e32 v32, 1.0, v16 +; SI-NEXT: v_mul_f32_e32 v23, 1.0, v26 +; SI-NEXT: v_mul_f32_e32 v25, 1.0, v25 +; SI-NEXT: v_mul_f32_e32 v28, 1.0, v28 +; SI-NEXT: v_mul_f32_e32 v10, 1.0, v38 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v52 -; SI-NEXT: v_mul_f32_e32 v26, 1.0, v25 -; SI-NEXT: v_mul_f32_e32 v57, 1.0, v28 -; SI-NEXT: v_mul_f32_e32 v16, 1.0, v27 -; SI-NEXT: v_mul_f32_e32 v28, 1.0, v30 -; SI-NEXT: v_mul_f32_e32 v30, 1.0, v29 -; SI-NEXT: v_mul_f32_e32 v31, 1.0, v33 -; SI-NEXT: v_mul_f32_e32 v27, 1.0, v34 +; SI-NEXT: v_mul_f32_e32 v10, 1.0, v48 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v27, 1.0, v27 +; 
SI-NEXT: v_mul_f32_e32 v26, 1.0, v30 +; SI-NEXT: v_mul_f32_e32 v24, 1.0, v29 +; SI-NEXT: v_mul_f32_e32 v29, 1.0, v33 +; SI-NEXT: v_mul_f32_e32 v17, 1.0, v34 +; SI-NEXT: s_waitcnt vmcnt(8) expcnt(0) +; SI-NEXT: v_mul_f32_e32 v10, 1.0, v54 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mul_f32_e32 v29, 1.0, v35 -; SI-NEXT: v_mul_f32_e32 v36, 1.0, v36 -; SI-NEXT: v_mul_f32_e32 v35, 1.0, v37 -; SI-NEXT: v_mul_f32_e32 v34, 1.0, v38 -; SI-NEXT: v_mul_f32_e32 v37, 1.0, v39 -; SI-NEXT: v_mul_f32_e32 v48, 1.0, v49 -; SI-NEXT: v_mul_f32_e32 v39, 1.0, v50 -; SI-NEXT: v_mul_f32_e32 v33, 1.0, v51 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_mul_f32_e32 v51, 1.0, v53 -; SI-NEXT: v_mul_f32_e32 v50, 1.0, v54 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_mul_f32_e32 v49, 1.0, v55 -; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: s_waitcnt vmcnt(6) expcnt(0) +; SI-NEXT: v_mul_f32_e32 v10, 1.0, v42 +; SI-NEXT: v_mul_f32_e32 v33, 1.0, v36 +; SI-NEXT: v_mul_f32_e32 v37, 1.0, v37 +; SI-NEXT: v_mul_f32_e32 v57, 1.0, v57 +; SI-NEXT: v_mul_f32_e32 v36, 1.0, v58 +; SI-NEXT: v_mul_f32_e32 v58, 1.0, v59 +; SI-NEXT: v_mul_f32_e32 v59, 1.0, v49 +; SI-NEXT: v_mul_f32_e32 v50, 1.0, v50 +; SI-NEXT: v_mul_f32_e32 v49, 1.0, v52 +; SI-NEXT: v_mul_f32_e32 v53, 1.0, v53 ; SI-NEXT: v_mul_f32_e32 v38, 1.0, v40 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_mul_f32_e32 v55, 1.0, v42 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_mul_f32_e32 v54, 1.0, v43 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_mul_f32_e32 v43, 1.0, v44 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_mul_f32_e32 v53, 1.0, v45 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v11, 1.0, s16 -; SI-NEXT: v_mul_f32_e64 v3, 1.0, s19 -; SI-NEXT: v_mul_f32_e64 v4, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v14, 1.0, s21 -; SI-NEXT: v_mul_f32_e64 v15, 1.0, s20 -; SI-NEXT: v_mul_f32_e64 v7, 1.0, s23 -; SI-NEXT: v_mul_f32_e64 v6, 1.0, s22 -; SI-NEXT: v_mul_f32_e64 v18, 1.0, s25 -; SI-NEXT: v_mul_f32_e64 v19, 1.0, s24 -; SI-NEXT: v_mul_f32_e64 v10, 1.0, s27 -; SI-NEXT: v_mul_f32_e64 v8, 1.0, s26 -; SI-NEXT: v_mul_f32_e64 v23, 1.0, s29 -; SI-NEXT: v_mul_f32_e64 v25, 1.0, s28 +; SI-NEXT: v_mul_f32_e32 v40, 1.0, v41 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(6) expcnt(0) +; SI-NEXT: v_mul_f32_e32 v10, 1.0, v43 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_mul_f32_e32 v56, 1.0, v44 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_mul_f32_e32 v14, 1.0, v45 +; SI-NEXT: v_mul_f32_e64 v13, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v15, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v52, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v11, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v30, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v34, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v54, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v16, 1.0, s22 +; SI-NEXT: v_mul_f32_e64 v41, 1.0, s25 +; SI-NEXT: v_mul_f32_e64 v42, 1.0, s24 +; SI-NEXT: v_mul_f32_e64 v45, 1.0, s27 +; SI-NEXT: v_mul_f32_e64 v48, 1.0, s26 +; SI-NEXT: v_mul_f32_e64 v43, 1.0, s29 +; SI-NEXT: v_mul_f32_e64 v44, 1.0, s28 ; SI-NEXT: ; implicit-def: $vgpr62 : SGPR spill to VGPR lane -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:388 ; 4-byte 
Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; SI-NEXT: 
buffer_store_dword v50, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr61 : SGPR spill to VGPR lane +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB91_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v3 -; SI-NEXT: v_mov_b32_e32 v42, v37 -; SI-NEXT: v_alignbit_b32 v37, v2, v11, 16 -; SI-NEXT: v_alignbit_b32 v11, v44, v4, 16 -; SI-NEXT: v_readfirstlane_b32 s4, v37 -; SI-NEXT: v_readfirstlane_b32 s5, v11 -; SI-NEXT: s_lshr_b64 s[6:7], s[4:5], 24 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v14 -; SI-NEXT: v_writelane_b32 v62, s6, 0 -; SI-NEXT: v_alignbit_b32 v2, v2, v15, 16 -; SI-NEXT: v_writelane_b32 v62, s7, 1 -; SI-NEXT: s_lshr_b64 s[6:7], s[4:5], 16 -; SI-NEXT: s_lshr_b64 s[10:11], s[4:5], 8 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: 
v_readfirstlane_b32 s4, v2 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v7 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v18 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; SI-NEXT: v_alignbit_b32 v14, v52, v6, 16 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload -; SI-NEXT: v_readfirstlane_b32 s5, v14 -; SI-NEXT: v_alignbit_b32 v2, v2, v19, 16 -; SI-NEXT: s_lshr_b64 s[8:9], s[4:5], 24 -; SI-NEXT: s_lshr_b64 s[12:13], s[4:5], 16 -; SI-NEXT: s_lshr_b64 s[16:17], s[4:5], 8 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: v_readfirstlane_b32 s4, v2 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v10 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; SI-NEXT: v_alignbit_b32 v19, v2, v8, 16 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v23 -; SI-NEXT: v_readfirstlane_b32 s5, v19 -; SI-NEXT: v_alignbit_b32 v2, v2, v25, 16 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v56 -; SI-NEXT: s_lshr_b64 s[14:15], s[4:5], 24 -; SI-NEXT: s_lshr_b64 s[18:19], s[4:5], 16 -; SI-NEXT: s_lshr_b64 s[22:23], s[4:5], 8 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: v_readfirstlane_b32 s4, v2 -; SI-NEXT: v_alignbit_b32 v47, v45, v47, 16 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v32 -; SI-NEXT: v_readfirstlane_b32 s5, v47 -; SI-NEXT: v_alignbit_b32 v2, v2, v1, 16 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v58 -; SI-NEXT: s_lshr_b64 s[20:21], s[4:5], 24 -; SI-NEXT: s_lshr_b64 s[24:25], s[4:5], 16 -; SI-NEXT: s_lshr_b64 s[28:29], s[4:5], 8 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: v_readfirstlane_b32 s4, v2 -; SI-NEXT: v_mov_b32_e32 v4, v58 -; SI-NEXT: v_alignbit_b32 v58, v8, v41, 16 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v59 -; SI-NEXT: v_readfirstlane_b32 s5, v58 -; SI-NEXT: v_alignbit_b32 v2, v2, v61, 16 -; SI-NEXT: s_lshr_b64 s[26:27], s[4:5], 24 -; SI-NEXT: s_lshr_b64 s[40:41], s[4:5], 16 -; SI-NEXT: s_lshr_b64 s[44:45], s[4:5], 8 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: v_readfirstlane_b32 s4, v2 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v5 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: v_alignbit_b32 v2, v2, v60, 16 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v23, v22 -; SI-NEXT: v_mov_b32_e32 v40, v36 -; SI-NEXT: s_mov_b64 vcc, 0 -; SI-NEXT: v_lshrrev_b32_e32 v56, 24, v56 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v18 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_alignbit_b32 v41, v15, v6, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v41 -; SI-NEXT: s_lshr_b64 s[42:43], s[4:5], 24 -; SI-NEXT: s_lshr_b64 s[46:47], s[4:5], 16 -; SI-NEXT: s_lshr_b64 s[58:59], s[4:5], 8 -; SI-NEXT: v_readfirstlane_b32 s4, v2 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v21 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v32 -; SI-NEXT: v_alignbit_b32 v59, v1, v13, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:376 ; 
4-byte Folded Spill -; SI-NEXT: v_readfirstlane_b32 s5, v59 -; SI-NEXT: s_waitcnt vmcnt(3) expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v5 -; SI-NEXT: s_lshr_b64 s[56:57], s[4:5], 24 -; SI-NEXT: s_lshr_b64 s[60:61], s[4:5], 16 -; SI-NEXT: s_lshr_b64 s[72:73], s[4:5], 8 -; SI-NEXT: v_alignbit_b32 v61, v1, v17, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v61 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: v_lshrrev_b32_e32 v13, 8, v58 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_alignbit_b32 v2, v2, v21, 16 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: v_readfirstlane_b32 s4, v2 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v9 -; SI-NEXT: v_alignbit_b32 v2, v2, v12, 16 -; SI-NEXT: s_lshr_b64 s[62:63], s[4:5], 24 -; SI-NEXT: s_lshr_b64 s[74:75], s[4:5], 16 -; SI-NEXT: s_lshr_b64 s[78:79], s[4:5], 8 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: v_readfirstlane_b32 s4, v2 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v22 -; SI-NEXT: v_alignbit_b32 v60, v2, v20, 16 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v24 -; SI-NEXT: v_alignbit_b32 v1, v2, v46, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v60 -; SI-NEXT: s_lshr_b64 s[76:77], s[4:5], 24 -; SI-NEXT: s_lshr_b64 s[88:89], s[4:5], 16 -; SI-NEXT: s_lshr_b64 s[92:93], s[4:5], 8 +; SI-NEXT: v_readfirstlane_b32 s4, v13 +; SI-NEXT: s_lshr_b32 s5, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s4, v15 +; SI-NEXT: s_lshr_b64 s[8:9], s[4:5], 16 +; SI-NEXT: v_readfirstlane_b32 s4, v52 +; SI-NEXT: s_lshr_b32 s7, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s4, v30 +; SI-NEXT: s_lshr_b32 s5, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s4, v34 +; SI-NEXT: s_lshr_b64 s[86:87], s[4:5], 16 +; SI-NEXT: v_readfirstlane_b32 s4, v54 +; SI-NEXT: s_lshr_b32 s65, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s4, v41 +; SI-NEXT: s_lshr_b32 s5, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s4, v42 +; SI-NEXT: s_lshr_b64 s[80:81], s[4:5], 16 +; SI-NEXT: v_readfirstlane_b32 s4, v45 +; SI-NEXT: s_lshr_b32 s69, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s4, v43 +; SI-NEXT: s_lshr_b32 s5, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s4, v44 +; SI-NEXT: v_mov_b32_e32 v34, v35 +; SI-NEXT: s_lshr_b64 s[66:67], s[4:5], 16 +; SI-NEXT: v_readfirstlane_b32 s4, v34 +; SI-NEXT: s_lshr_b32 s91, s4, 16 +; SI-NEXT: v_mov_b32_e32 v30, v51 +; SI-NEXT: v_readfirstlane_b32 s4, v47 +; SI-NEXT: v_mov_b32_e32 v51, v46 +; SI-NEXT: s_lshr_b32 s5, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s4, v51 +; SI-NEXT: v_mov_b32_e32 v35, v6 +; SI-NEXT: s_lshr_b64 s[52:53], s[4:5], 16 +; SI-NEXT: v_readfirstlane_b32 s4, v35 +; SI-NEXT: s_lshr_b32 s37, s4, 16 ; SI-NEXT: v_readfirstlane_b32 s4, v1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v1, v5 -; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v23 -; SI-NEXT: v_mov_b32_e32 v5, v28 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; SI-NEXT: v_lshrrev_b32_e32 v24, 24, v10 -; SI-NEXT: v_lshrrev_b32_e32 v9, 8, v47 -; SI-NEXT: v_lshrrev_b32_e32 
v12, 8, v41 -; SI-NEXT: v_lshrrev_b32_e32 v10, 8, v61 -; SI-NEXT: v_lshrrev_b32_e32 v23, 8, v60 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v20 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: v_alignbit_b32 v25, v2, v26, 16 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v57 -; SI-NEXT: v_readfirstlane_b32 s5, v25 -; SI-NEXT: v_alignbit_b32 v2, v2, v16, 16 -; SI-NEXT: s_lshr_b64 s[90:91], s[4:5], 24 -; SI-NEXT: s_lshr_b64 s[94:95], s[4:5], 16 -; SI-NEXT: s_lshr_b64 s[34:35], s[4:5], 8 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: v_readfirstlane_b32 s4, v2 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v28 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: v_alignbit_b32 v22, v2, v30, 16 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v31 -; SI-NEXT: v_readfirstlane_b32 s5, v22 -; SI-NEXT: v_alignbit_b32 v2, v2, v27, 16 -; SI-NEXT: s_lshr_b64 s[30:31], s[4:5], 24 -; SI-NEXT: s_lshr_b64 s[36:37], s[4:5], 16 -; SI-NEXT: s_lshr_b64 s[38:39], s[4:5], 8 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: v_readfirstlane_b32 s4, v2 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v29 -; SI-NEXT: v_alignbit_b32 v17, v2, v36, 16 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v35 -; SI-NEXT: v_readfirstlane_b32 s5, v17 -; SI-NEXT: v_alignbit_b32 v2, v2, v34, 16 -; SI-NEXT: s_lshr_b64 s[48:49], s[4:5], 24 +; SI-NEXT: s_lshr_b32 s5, s4, 16 +; SI-NEXT: v_mov_b32_e32 v41, v5 +; SI-NEXT: v_readfirstlane_b32 s4, v5 +; SI-NEXT: v_mov_b32_e32 v5, v39 +; SI-NEXT: s_lshr_b64 s[30:31], s[4:5], 16 +; SI-NEXT: v_readfirstlane_b32 s4, v5 +; SI-NEXT: s_lshr_b32 s89, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s4, v9 +; SI-NEXT: s_lshr_b32 s5, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: s_lshr_b64 s[50:51], s[4:5], 16 -; SI-NEXT: s_lshr_b64 s[52:53], s[4:5], 8 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: v_readfirstlane_b32 s4, v2 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v42 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v20 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v5 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v22 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v29 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v29, v37 -; SI-NEXT: v_mov_b32_e32 v37, v42 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v17 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v37 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; 
SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v33 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; SI-NEXT: v_lshrrev_b32_e32 v34, 24, v4 +; SI-NEXT: s_lshr_b32 s57, s4, 16 +; SI-NEXT: v_mov_b32_e32 v42, v32 +; SI-NEXT: v_readfirstlane_b32 s4, v32 +; SI-NEXT: v_mov_b32_e32 v32, v4 +; SI-NEXT: s_lshr_b32 s5, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s4, v32 +; SI-NEXT: v_mov_b32_e32 v6, v55 +; SI-NEXT: s_lshr_b64 s[92:93], s[4:5], 16 +; SI-NEXT: v_readfirstlane_b32 s4, v6 +; SI-NEXT: s_lshr_b32 s79, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s4, v20 +; SI-NEXT: v_mov_b32_e32 v39, v12 +; SI-NEXT: s_lshr_b32 s5, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s4, v39 +; SI-NEXT: v_mov_b32_e32 v9, v8 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_mov_b32_e32 v43, v20 +; SI-NEXT: s_lshr_b64 s[76:77], s[4:5], 16 +; SI-NEXT: v_readfirstlane_b32 s4, v9 +; SI-NEXT: v_mov_b32_e32 v20, v21 +; SI-NEXT: v_readfirstlane_b32 s78, v18 +; SI-NEXT: s_lshr_b32 s73, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s4, v20 +; SI-NEXT: v_mov_b32_e32 v18, v22 +; SI-NEXT: s_lshr_b32 s5, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s4, v18 +; SI-NEXT: s_lshr_b64 s[62:63], s[4:5], 16 +; SI-NEXT: v_readfirstlane_b32 s4, v23 +; SI-NEXT: s_lshr_b32 s59, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s4, v28 +; SI-NEXT: v_mov_b32_e32 v21, v25 +; SI-NEXT: s_lshr_b32 s5, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s4, v27 +; SI-NEXT: v_mov_b32_e32 v25, v26 +; SI-NEXT: s_lshr_b64 s[46:47], s[4:5], 16 +; SI-NEXT: v_readfirstlane_b32 s4, v25 +; SI-NEXT: v_mov_b32_e32 v12, v29 +; SI-NEXT: s_lshr_b32 s45, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s4, v12 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_mov_b32_e32 v44, v1 +; SI-NEXT: s_lshr_b32 s5, s4, 16 +; SI-NEXT: v_mov_b32_e32 v1, v52 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_mov_b32_e32 v52, v17 +; SI-NEXT: v_readfirstlane_b32 s4, v17 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v29, v33 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v22, v24 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: s_lshr_b64 s[40:41], s[4:5], 16 +; SI-NEXT: v_readfirstlane_b32 s4, v29 +; SI-NEXT: s_lshr_b32 s29, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s6, v11 +; SI-NEXT: v_readfirstlane_b32 s12, v40 +; SI-NEXT: s_lshr_b64 s[96:97], s[6:7], 16 +; SI-NEXT: s_mov_b32 s9, s96 +; SI-NEXT: v_readfirstlane_b32 s88, v60 +; SI-NEXT: s_lshr_b64 s[82:83], s[88:89], 16 +; SI-NEXT: v_readfirstlane_b32 s64, v16 +; SI-NEXT: s_lshr_b64 s[84:85], s[64:65], 16 +; SI-NEXT: s_mov_b32 s87, s84 +; SI-NEXT: v_readfirstlane_b32 s68, v48 +; SI-NEXT: s_lshr_b64 s[70:71], s[68:69], 16 +; SI-NEXT: s_mov_b32 s81, s70 +; SI-NEXT: v_readfirstlane_b32 s90, v30 +; SI-NEXT: s_lshr_b64 s[38:39], s[90:91], 16 +; SI-NEXT: s_mov_b32 s67, s38 +; SI-NEXT: v_readfirstlane_b32 s36, v3 +; SI-NEXT: s_lshr_b64 s[98:99], s[36:37], 16 +; SI-NEXT: s_mov_b32 s53, s98 +; SI-NEXT: s_mov_b32 s31, s82 +; SI-NEXT: v_readfirstlane_b32 s56, v7 +; SI-NEXT: s_lshr_b64 s[94:95], s[56:57], 16 +; SI-NEXT: s_mov_b32 s51, s94 +; SI-NEXT: s_lshr_b64 s[74:75], s[78:79], 16 +; SI-NEXT: s_mov_b32 s93, s74 +; SI-NEXT: v_readfirstlane_b32 s72, v19 +; SI-NEXT: s_lshr_b64 s[60:61], s[72:73], 16 +; SI-NEXT: s_mov_b32 s77, s60 +; SI-NEXT: v_readfirstlane_b32 s58, v21 +; SI-NEXT: s_lshr_b64 
s[54:55], s[58:59], 16
+; SI-NEXT: s_mov_b32 s63, s54
+; SI-NEXT: v_readfirstlane_b32 s44, v22
+; SI-NEXT: s_lshr_b64 s[42:43], s[44:45], 16
+; SI-NEXT: s_mov_b32 s47, s42
+; SI-NEXT: v_mov_b32_e32 v26, v37
+; SI-NEXT: v_readfirstlane_b32 s28, v26
+; SI-NEXT: s_lshr_b64 s[26:27], s[28:29], 16
+; SI-NEXT: s_mov_b32 s41, s26
+; SI-NEXT: v_readfirstlane_b32 s22, v36
+; SI-NEXT: v_readfirstlane_b32 s18, v49
+; SI-NEXT: v_lshrrev_b32_e32 v48, 24, v1
+; SI-NEXT: v_mov_b32_e32 v1, v56
+; SI-NEXT: v_mov_b32_e32 v3, v54
+; SI-NEXT: v_lshrrev_b32_e32 v37, 24, v6
+; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v50
+; SI-NEXT: v_lshrrev_b32_e32 v8, 24, v38
+; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v1
+; SI-NEXT: s_waitcnt expcnt(3)
+; SI-NEXT: v_mov_b32_e32 v54, v59
+; SI-NEXT: s_lshr_b32 s78, s96, 8
+; SI-NEXT: s_lshr_b32 s61, s84, 8
+; SI-NEXT: s_lshr_b32 s72, s70, 8
+; SI-NEXT: s_lshr_b32 s75, s38, 8
+; SI-NEXT: s_lshr_b32 s58, s98, 8
+; SI-NEXT: s_lshr_b32 s43, s82, 8
+; SI-NEXT: s_lshr_b32 s44, s94, 8
+; SI-NEXT: s_mov_b32 s64, s74
+; SI-NEXT: s_lshr_b32 s27, s74, 8
+; SI-NEXT: s_mov_b32 s90, s60
+; SI-NEXT: s_lshr_b32 s28, s60, 8
+; SI-NEXT: s_lshr_b32 s74, s54, 8
+; SI-NEXT: s_mov_b32 s68, s42
+; SI-NEXT: s_mov_b32 s56, s26
 ; SI-NEXT: v_lshrrev_b32_e32 v16, 24, v3
-; SI-NEXT: v_lshrrev_b32_e32 v3, 8, v11
-; SI-NEXT: v_lshrrev_b32_e32 v26, 24, v7
-; SI-NEXT: v_lshrrev_b32_e32 v7, 8, v14
-; SI-NEXT: v_lshrrev_b32_e32 v27, 8, v19
-; SI-NEXT: v_lshrrev_b32_e32 v4, 8, v59
-; SI-NEXT: v_lshrrev_b32_e32 v35, 24, v43
-; SI-NEXT: v_mov_b32_e32 v31, v20
-; SI-NEXT: v_mov_b32_e32 v20, v34
-; SI-NEXT: s_waitcnt vmcnt(10)
-; SI-NEXT: v_alignbit_b32 v30, v2, v36, 16
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v48
-; SI-NEXT: v_alignbit_b32 v2, v2, v39, 16
-; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload
-; SI-NEXT: v_readfirstlane_b32 s5, v30
-; SI-NEXT: s_lshr_b64 s[54:55], s[4:5], 24
-; SI-NEXT: s_lshr_b64 s[64:65], s[4:5], 16
-; SI-NEXT: s_lshr_b64 s[68:69], s[4:5], 8
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
-; SI-NEXT: v_readfirstlane_b32 s4, v2
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v33
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
-; SI-NEXT: v_mov_b32_e32 v28, v36
-; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_alignbit_b32 v57, v2, v39, 16
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v51
-; SI-NEXT: v_readfirstlane_b32 s5, v57
-; SI-NEXT: v_alignbit_b32 v2, v2, v50, 16
-; SI-NEXT: s_lshr_b64 s[66:67], s[4:5], 24
-; SI-NEXT: s_lshr_b64 s[70:71], s[4:5], 16
-; SI-NEXT: s_lshr_b64 s[82:83], s[4:5], 8
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
-; SI-NEXT: v_readfirstlane_b32 s4, v2
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v49
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
-; SI-NEXT: v_alignbit_b32 v46, v2, v38, 16
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v55
-; SI-NEXT: v_readfirstlane_b32 s5, v46
-; SI-NEXT: v_alignbit_b32 v2, v2, v54, 16
-; SI-NEXT: s_lshr_b64 s[80:81], s[4:5], 24
-; SI-NEXT: s_lshr_b64 s[84:85], s[4:5], 16
-; SI-NEXT: s_lshr_b64 s[96:97], s[4:5], 8
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
-; SI-NEXT: v_readfirstlane_b32 s4, v2
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v43
-; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v57
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
-; SI-NEXT: v_alignbit_b32 v38, v2, v53, 16
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v2, 24, v18
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v49
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v2, v32
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v46
-; SI-NEXT: v_readfirstlane_b32 s5, v38
-; SI-NEXT: v_lshrrev_b32_e32 v2, 24, v2
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v38
-; SI-NEXT: s_lshr_b64 s[86:87], s[4:5], 24
-; SI-NEXT: s_lshr_b64 s[98:99], s[4:5], 16
-; SI-NEXT: s_lshr_b64 s[4:5], s[4:5], 8
-; SI-NEXT: v_mov_b32_e32 v32, v8
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
-; SI-NEXT: v_lshrrev_b32_e32 v18, 8, v25
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v2, 8, v30
-; SI-NEXT: v_mov_b32_e32 v55, v49
-; SI-NEXT: v_mov_b32_e32 v49, v15
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
-; SI-NEXT: v_mov_b32_e32 v8, v6
+; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v45
+; SI-NEXT: v_lshrrev_b32_e32 v13, 24, v34
+; SI-NEXT: v_lshrrev_b32_e32 v11, 24, v35
+; SI-NEXT: v_lshrrev_b32_e32 v47, 24, v5
+; SI-NEXT: v_lshrrev_b32_e32 v3, 24, v2
+; SI-NEXT: v_lshrrev_b32_e32 v55, 24, v9
+; SI-NEXT: v_lshrrev_b32_e32 v4, 24, v25
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt vmcnt(5)
+; SI-NEXT: v_readfirstlane_b32 s4, v17
+; SI-NEXT: s_lshr_b32 s5, s4, 16
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt vmcnt(4)
+; SI-NEXT: v_readfirstlane_b32 s4, v33
+; SI-NEXT: s_lshr_b64 s[24:25], s[4:5], 16
+; SI-NEXT: v_readfirstlane_b32 s4, v57
+; SI-NEXT: s_lshr_b32 s23, s4, 16
+; SI-NEXT: v_readfirstlane_b32 s4, v58
+; SI-NEXT: s_lshr_b32 s5, s4, 16
+; SI-NEXT: v_readfirstlane_b32 s4, v59
+; SI-NEXT: s_lshr_b64 s[16:17], s[4:5], 16
+; SI-NEXT: v_readfirstlane_b32 s4, v50
+; SI-NEXT: s_lshr_b32 s19, s4, 16
+; SI-NEXT: v_readfirstlane_b32 s4, v53
+; SI-NEXT: s_lshr_b32 s5, s4, 16
+; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: v_readfirstlane_b32 s4, v24
+; SI-NEXT: s_lshr_b64 s[10:11], s[4:5], 16
+; SI-NEXT: v_readfirstlane_b32 s4, v38
+; SI-NEXT: s_lshr_b32 s13, s4, 16
+; SI-NEXT: s_mov_b32 s5, s13
+; SI-NEXT: v_writelane_b32 v61, s4, 26
+; SI-NEXT: v_writelane_b32 v61, s5, 27
+; SI-NEXT: v_readfirstlane_b32 s4, v46
+; SI-NEXT: s_lshr_b32 s5, s4, 16
+; SI-NEXT: v_readfirstlane_b32 s4, v10
+; SI-NEXT: s_lshr_b64 s[4:5], s[4:5], 16
+; SI-NEXT: v_readfirstlane_b32 s5, v56
+; SI-NEXT: s_lshr_b64 s[20:21], s[12:13], 16
+; SI-NEXT: s_lshr_b32 s13, s5, 16
+; SI-NEXT: v_readfirstlane_b32 s12, v14
+; SI-NEXT: s_lshr_b64 vcc, s[12:13], 16
+; SI-NEXT: s_mov_b32 s5, vcc_lo
+; SI-NEXT: s_mov_b32 s88, vcc_lo
+; SI-NEXT: s_lshr_b32 s6, vcc_lo, 8
+; SI-NEXT: s_lshr_b64 vcc, s[8:9], 24
+; SI-NEXT: v_writelane_b32 v62, vcc_lo, 4
+; SI-NEXT: v_writelane_b32 v62, vcc_hi, 5
+; SI-NEXT: s_lshr_b64 vcc, s[8:9], 16
+; SI-NEXT: v_writelane_b32 v62, vcc_lo, 2
+; SI-NEXT: v_writelane_b32 v62, vcc_hi, 3
+; SI-NEXT: s_lshr_b64 vcc, s[8:9], 8
+; SI-NEXT: v_writelane_b32 v62, vcc_lo, 0
+; SI-NEXT: v_writelane_b32 v62, vcc_hi, 1
+; SI-NEXT: s_lshr_b64 vcc, s[86:87], 24
+; SI-NEXT: v_writelane_b32 v62, vcc_lo, 10
+; SI-NEXT: v_writelane_b32 v62, vcc_hi, 11
+; SI-NEXT: s_lshr_b64 vcc, s[86:87], 16
+; SI-NEXT: v_writelane_b32 v62, vcc_lo, 8
+; SI-NEXT: v_writelane_b32 v62, vcc_hi, 9
+; SI-NEXT: s_lshr_b64 vcc, s[86:87], 8
+; SI-NEXT: v_writelane_b32 v62, vcc_lo, 6
+; SI-NEXT: v_writelane_b32 v62, vcc_hi, 7
+; SI-NEXT: s_lshr_b64 vcc, s[80:81], 24
+; SI-NEXT: v_writelane_b32 v62, vcc_lo, 16
+; SI-NEXT: v_writelane_b32 v62, vcc_hi, 17
+; SI-NEXT: s_lshr_b64 vcc, s[80:81], 16
+; SI-NEXT: v_writelane_b32 v62, vcc_lo, 14
+; SI-NEXT: v_writelane_b32 v62, vcc_hi, 15
+; SI-NEXT: s_lshr_b64 vcc, s[80:81], 8
+; SI-NEXT: v_writelane_b32 v62, vcc_lo, 12
+; SI-NEXT: v_writelane_b32 v62, vcc_hi, 13
+; SI-NEXT: s_lshr_b64 vcc, s[66:67], 24
+; SI-NEXT: v_writelane_b32 v62, vcc_lo, 22
+; SI-NEXT: v_writelane_b32 v62, vcc_hi, 23
+; SI-NEXT: s_lshr_b64 vcc, s[66:67], 16
+; SI-NEXT: v_writelane_b32 v62, vcc_lo, 20
+; SI-NEXT: v_writelane_b32 v62, vcc_hi, 21
+; SI-NEXT: s_lshr_b64 vcc, s[66:67], 8
+; SI-NEXT: v_writelane_b32 v62, vcc_lo, 18
+; SI-NEXT: v_writelane_b32 v62, vcc_hi, 19
+; SI-NEXT: s_lshr_b64 vcc, s[52:53], 24
+; SI-NEXT: v_writelane_b32 v62, vcc_lo, 28
+; SI-NEXT: v_writelane_b32 v62, vcc_hi, 29
+; SI-NEXT: s_lshr_b64 vcc, s[52:53], 16
+; SI-NEXT: v_writelane_b32 v62, vcc_lo, 26
+; SI-NEXT: v_writelane_b32 v62, vcc_hi, 27
+; SI-NEXT: s_lshr_b64 vcc, s[52:53], 8
+; SI-NEXT: v_writelane_b32 v62, vcc_lo, 24
+; SI-NEXT: v_writelane_b32 v62, vcc_hi, 25
+; SI-NEXT: s_lshr_b64 vcc, s[30:31], 24
+; SI-NEXT: v_writelane_b32 v62, vcc_lo, 34
+; SI-NEXT: v_writelane_b32 v62, vcc_hi, 35
+; SI-NEXT: s_lshr_b64 vcc, s[30:31], 16
+; SI-NEXT: v_writelane_b32 v62, vcc_lo, 32
+; SI-NEXT: v_writelane_b32 v62, vcc_hi, 33
+; SI-NEXT: s_lshr_b64 vcc, s[30:31], 8
+; SI-NEXT: v_writelane_b32 v62, vcc_lo, 30
+; SI-NEXT: v_writelane_b32 v62, vcc_hi, 31
+; SI-NEXT: s_lshr_b64 vcc, s[50:51], 24
+; SI-NEXT: v_writelane_b32 v62, vcc_lo, 40
+; SI-NEXT: v_writelane_b32 v62, vcc_hi, 41
+; SI-NEXT: s_lshr_b64 vcc, s[50:51], 16
+; SI-NEXT: v_writelane_b32 v62, vcc_lo, 38
+; SI-NEXT: v_writelane_b32 v62, vcc_hi, 39
+; SI-NEXT: s_lshr_b64 vcc, s[50:51], 8
+; SI-NEXT: v_writelane_b32 v62, vcc_lo, 36
+; SI-NEXT: v_writelane_b32 v62, vcc_hi, 37
+; SI-NEXT: s_lshr_b64 vcc, s[92:93], 24
+; SI-NEXT: v_writelane_b32 v62, vcc_lo, 46
+; SI-NEXT: v_writelane_b32 v62, vcc_hi, 47
+; SI-NEXT: s_lshr_b64 vcc, s[92:93], 16
+; SI-NEXT: v_writelane_b32 v62, vcc_lo, 44
+; SI-NEXT: v_writelane_b32 v62, vcc_hi, 45
+; SI-NEXT: s_lshr_b64 vcc, s[92:93], 8
+; SI-NEXT: v_writelane_b32 v62, vcc_lo, 42
+; SI-NEXT: v_writelane_b32 v62, vcc_hi, 43
+; SI-NEXT: s_lshr_b64 vcc, s[76:77], 24
+; SI-NEXT: v_writelane_b32 v62, vcc_lo, 52
+; SI-NEXT: v_writelane_b32 v62, vcc_hi, 53
+; SI-NEXT: s_lshr_b64 vcc, s[76:77], 16
+; SI-NEXT: v_writelane_b32 v62, vcc_lo, 50
+; SI-NEXT: v_writelane_b32 v62, vcc_hi, 51
+; SI-NEXT: s_lshr_b64 vcc, s[76:77], 8
+; SI-NEXT: v_writelane_b32 v62, vcc_lo, 48
+; SI-NEXT: v_writelane_b32 v62, vcc_hi, 49
+; SI-NEXT: s_lshr_b64 vcc, s[62:63], 24
+; SI-NEXT: v_writelane_b32 v62, vcc_lo, 58
+; SI-NEXT: v_writelane_b32 v62, vcc_hi, 59
+; SI-NEXT: s_lshr_b64 vcc, s[62:63], 16
+; SI-NEXT: v_writelane_b32 v62, vcc_lo, 56
+; SI-NEXT: v_writelane_b32 v62, vcc_hi, 57
+; SI-NEXT: s_lshr_b64 vcc, s[62:63], 8
+; SI-NEXT: v_writelane_b32 v62, vcc_lo, 54
+; SI-NEXT: v_writelane_b32 v62, vcc_hi, 55
+; SI-NEXT: s_lshr_b64 vcc, s[46:47], 24
+; SI-NEXT: v_writelane_b32 v61, vcc_lo, 0
+; SI-NEXT: v_writelane_b32 v61, vcc_hi, 1
+; SI-NEXT: s_lshr_b64 vcc, s[46:47], 16
+; SI-NEXT: v_writelane_b32 v62, vcc_lo, 62
+; SI-NEXT: v_writelane_b32 v62, vcc_hi, 63
+; SI-NEXT: s_lshr_b64 vcc, s[46:47], 8
+; SI-NEXT: v_writelane_b32 v62, vcc_lo, 60
+; SI-NEXT: v_writelane_b32 v62, vcc_hi, 61
+; SI-NEXT: s_lshr_b64 vcc, s[40:41], 24
+; SI-NEXT: v_writelane_b32 v61, vcc_lo, 6
+; SI-NEXT: v_writelane_b32 v61, vcc_hi, 7
+; SI-NEXT: s_lshr_b64 vcc, s[40:41], 16
+; SI-NEXT: v_writelane_b32 v61, vcc_lo, 4
+; SI-NEXT: s_lshr_b64 s[34:35], s[22:23], 16
+; SI-NEXT: v_writelane_b32 v61, vcc_hi, 5
+; SI-NEXT: s_lshr_b64 vcc, s[40:41], 8
+; SI-NEXT: s_mov_b32 s25, s34
+; SI-NEXT: v_writelane_b32 v61, vcc_lo, 2
+; SI-NEXT: v_writelane_b32 v61, vcc_hi, 3
+; SI-NEXT: s_lshr_b64 vcc, s[24:25], 24
+; SI-NEXT: v_writelane_b32 v61, vcc_lo, 12
+; SI-NEXT: v_writelane_b32 v61, vcc_hi, 13
+; SI-NEXT: s_lshr_b64 vcc, s[24:25], 16
+; SI-NEXT: v_writelane_b32 v61, vcc_lo, 10
+; SI-NEXT: s_lshr_b64 s[14:15], s[18:19], 16
+; SI-NEXT: v_writelane_b32 v61, vcc_hi, 11
+; SI-NEXT: s_lshr_b64 vcc, s[24:25], 8
+; SI-NEXT: s_mov_b32 s17, s14
+; SI-NEXT: v_writelane_b32 v61, vcc_lo, 8
+; SI-NEXT: v_writelane_b32 v61, vcc_hi, 9
+; SI-NEXT: s_lshr_b64 vcc, s[16:17], 24
+; SI-NEXT: v_writelane_b32 v61, vcc_lo, 18
+; SI-NEXT: v_writelane_b32 v61, vcc_hi, 19
+; SI-NEXT: s_lshr_b64 vcc, s[16:17], 16
+; SI-NEXT: v_writelane_b32 v61, vcc_lo, 16
+; SI-NEXT: v_writelane_b32 v61, vcc_hi, 17
+; SI-NEXT: s_lshr_b64 vcc, s[16:17], 8
+; SI-NEXT: s_mov_b32 s11, s20
+; SI-NEXT: v_writelane_b32 v61, vcc_lo, 14
+; SI-NEXT: v_writelane_b32 v61, vcc_hi, 15
+; SI-NEXT: s_lshr_b64 vcc, s[10:11], 24
+; SI-NEXT: v_writelane_b32 v61, vcc_lo, 24
+; SI-NEXT: v_writelane_b32 v61, vcc_hi, 25
+; SI-NEXT: s_lshr_b64 vcc, s[10:11], 16
+; SI-NEXT: v_writelane_b32 v61, vcc_lo, 22
+; SI-NEXT: v_writelane_b32 v61, vcc_hi, 23
+; SI-NEXT: s_lshr_b64 vcc, s[10:11], 8
+; SI-NEXT: v_writelane_b32 v61, vcc_lo, 20
+; SI-NEXT: v_writelane_b32 v61, vcc_hi, 21
+; SI-NEXT: s_lshr_b64 vcc, s[4:5], 24
+; SI-NEXT: v_writelane_b32 v61, vcc_lo, 32
+; SI-NEXT: v_writelane_b32 v61, vcc_hi, 33
+; SI-NEXT: s_lshr_b64 vcc, s[4:5], 16
+; SI-NEXT: v_writelane_b32 v61, vcc_lo, 30
+; SI-NEXT: v_writelane_b32 v61, vcc_hi, 31
+; SI-NEXT: s_lshr_b64 vcc, s[4:5], 8
+; SI-NEXT: v_writelane_b32 v61, vcc_lo, 28
+; SI-NEXT: s_waitcnt expcnt(5)
+; SI-NEXT: v_lshrrev_b32_e32 v10, 24, v23
+; SI-NEXT: s_lshr_b32 s22, s42, 8
+; SI-NEXT: s_lshr_b32 s21, s26, 8
+; SI-NEXT: s_lshr_b32 s18, s34, 8
+; SI-NEXT: s_mov_b32 s36, s14
+; SI-NEXT: s_lshr_b32 s15, s14, 8
+; SI-NEXT: s_mov_b32 s14, s20
+; SI-NEXT: s_lshr_b32 s12, s20, 8
+; SI-NEXT: v_writelane_b32 v61, vcc_hi, 29
+; SI-NEXT: s_mov_b64 vcc, 0
+; SI-NEXT: s_waitcnt expcnt(3)
+; SI-NEXT: v_lshrrev_b32_e32 v14, 24, v29
+; SI-NEXT: v_lshrrev_b32_e32 v56, 24, v57
+; SI-NEXT: v_mov_b32_e32 v59, v30
+; SI-NEXT: v_mov_b32_e32 v31, v51
+; SI-NEXT: v_mov_b32_e32 v60, v34
+; SI-NEXT: v_mov_b32_e32 v30, v39
+; SI-NEXT: v_mov_b32_e32 v19, v5
+; SI-NEXT: v_mov_b32_e32 v39, v21
+; SI-NEXT: v_mov_b32_e32 v21, v20
+; SI-NEXT: v_mov_b32_e32 v34, v18
+; SI-NEXT: v_mov_b32_e32 v18, v37
+; SI-NEXT: s_waitcnt expcnt(2)
+; SI-NEXT: v_mov_b32_e32 v7, v26
+; SI-NEXT: v_mov_b32_e32 v20, v2
+; SI-NEXT: v_mov_b32_e32 v37, v17
+; SI-NEXT: v_mov_b32_e32 v51, v33
+; SI-NEXT: v_mov_b32_e32 v17, v9
+; SI-NEXT: v_mov_b32_e32 v9, v10
+; SI-NEXT: v_mov_b32_e32 v26, v25
 ; SI-NEXT: s_branch .LBB91_3
 ; SI-NEXT: .LBB91_2:
-; SI-NEXT: ; implicit-def: $vgpr8
-; SI-NEXT: ; kill: killed $vgpr8
-; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: ; kill: killed $vgpr1
-; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; kill: killed $vgpr2
 ; SI-NEXT: ; implicit-def: $sgpr4
-; SI-NEXT: s_waitcnt expcnt(5)
-; SI-NEXT: v_mov_b32_e32 v55, v49
-; SI-NEXT: ; implicit-def: $vgpr8
-; SI-NEXT: ; kill: killed $vgpr8
-; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: ; kill: killed $vgpr1
-; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; kill: killed $vgpr2
+; SI-NEXT: s_waitcnt expcnt(3)
+; SI-NEXT: v_mov_b32_e32 v54, v59
 ; SI-NEXT: v_writelane_b32 v62, s4, 0
-; SI-NEXT: ; implicit-def: $vgpr8
-; SI-NEXT: ; kill: killed $vgpr8
-; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: ; kill: killed $vgpr1
-; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; kill: killed $vgpr2
-; SI-NEXT: v_mov_b32_e32 v40, v36
-; SI-NEXT: ; implicit-def: $vgpr8
-; SI-NEXT: ; kill: killed $vgpr8
+; SI-NEXT: v_writelane_b32 v62, s5, 1
+; SI-NEXT: ; implicit-def: $sgpr4
+; SI-NEXT: v_mov_b32_e32 v59, v51
+; SI-NEXT: v_writelane_b32 v62, s4, 2
+; SI-NEXT: v_writelane_b32 v62, s5, 3
+; SI-NEXT: ; implicit-def: $sgpr4
+; SI-NEXT: v_mov_b32_e32 v31, v46
+; SI-NEXT: v_writelane_b32 v62, s4, 4
+; SI-NEXT: v_writelane_b32 v62, s5, 5
+; SI-NEXT: ; implicit-def: $sgpr4
+; SI-NEXT: v_mov_b32_e32 v34, v22
+; SI-NEXT: v_writelane_b32 v62, s4, 6
+; SI-NEXT: v_writelane_b32 v62, s5, 7
+; SI-NEXT: ; implicit-def: $sgpr4
+; SI-NEXT: v_mov_b32_e32 v22, v24
+; SI-NEXT: v_writelane_b32 v62, s4, 8
+; SI-NEXT: v_writelane_b32 v62, s5, 9
+; SI-NEXT: ; implicit-def: $sgpr4
+; SI-NEXT: v_mov_b32_e32 v7, v37
+; SI-NEXT: v_writelane_b32 v62, s4, 10
+; SI-NEXT: v_writelane_b32 v62, s5, 11
+; SI-NEXT: ; implicit-def: $sgpr4
+; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
+; SI-NEXT: v_writelane_b32 v62, s4, 12
+; SI-NEXT: v_writelane_b32 v62, s5, 13
+; SI-NEXT: ; implicit-def: $sgpr4
+; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
+; SI-NEXT: v_writelane_b32 v62, s4, 14
+; SI-NEXT: v_writelane_b32 v62, s5, 15
+; SI-NEXT: ; implicit-def: $sgpr4
+; SI-NEXT: ; implicit-def: $sgpr21
+; SI-NEXT: ; implicit-def: $sgpr20
+; SI-NEXT: ; implicit-def: $sgpr89
+; SI-NEXT: ; implicit-def: $sgpr88
+; SI-NEXT: v_mov_b32_e32 v44, v1
+; SI-NEXT: v_writelane_b32 v62, s4, 16
+; SI-NEXT: v_writelane_b32 v62, s5, 17
+; SI-NEXT: ; implicit-def: $sgpr4
 ; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: ; kill: killed $vgpr1
-; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; kill: killed $vgpr2
+; SI-NEXT: v_mov_b32_e32 v52, v17
+; SI-NEXT: v_writelane_b32 v62, s4, 18
+; SI-NEXT: v_writelane_b32 v62, s5, 19
+; SI-NEXT: ; implicit-def: $sgpr4
+; SI-NEXT: v_mov_b32_e32 v43, v20
+; SI-NEXT: v_writelane_b32 v62, s4, 20
+; SI-NEXT: v_writelane_b32 v62, s5, 21
+; SI-NEXT: ; implicit-def: $sgpr4
+; SI-NEXT: v_mov_b32_e32 v42, v32
+; SI-NEXT: v_writelane_b32 v62, s4, 22
+; SI-NEXT: v_writelane_b32 v62, s5, 23
+; SI-NEXT: ; implicit-def: $sgpr4
+; SI-NEXT: v_mov_b32_e32 v41, v5
+; SI-NEXT: v_writelane_b32 v62, s4, 24
+; SI-NEXT: v_writelane_b32 v62, s5, 25
+; SI-NEXT: ; implicit-def: $sgpr4
 ; SI-NEXT: s_mov_b64 vcc, -1
-; SI-NEXT: ; implicit-def: $vgpr8
-; SI-NEXT: ; kill: killed $vgpr8
-; SI-NEXT: ; implicit-def: $vgpr1
+; SI-NEXT: v_writelane_b32 v62, s4, 26
+; SI-NEXT: v_writelane_b32 v62, s5, 27
+; SI-NEXT: ; implicit-def: $sgpr4
+; SI-NEXT: v_mov_b32_e32 v60, v35
+; SI-NEXT: v_writelane_b32 v62, s4, 28
+; SI-NEXT: v_writelane_b32 v62, s5, 29
+; SI-NEXT: ; implicit-def: $sgpr4
 ; SI-NEXT: ; kill: killed $vgpr1
-; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; kill: killed $vgpr2
-; SI-NEXT: v_writelane_b32 v62, s5, 1
-; SI-NEXT: ; implicit-def: $vgpr8
-; SI-NEXT: ; kill: killed $vgpr8
+; SI-NEXT: v_mov_b32_e32 v35, v6
+; SI-NEXT: v_writelane_b32 v62, s4, 30
+; SI-NEXT: v_writelane_b32 v62, s5, 31
+; SI-NEXT: ; implicit-def: $sgpr4
+; SI-NEXT: v_mov_b32_e32 v32, v4
+; SI-NEXT: v_writelane_b32 v62, s4, 32
+; SI-NEXT: v_writelane_b32 v62, s5, 33
+; SI-NEXT: ; implicit-def: $sgpr4
+; SI-NEXT: v_mov_b32_e32 v30, v12
+; SI-NEXT: v_writelane_b32 v62, s4, 34
+; SI-NEXT: v_writelane_b32 v62, s5, 35
+; SI-NEXT: ; implicit-def: $sgpr4
+; SI-NEXT: v_mov_b32_e32 v19, v39
+; SI-NEXT: v_writelane_b32 v62, s4, 36
+; SI-NEXT: v_writelane_b32 v62, s5, 37
+; SI-NEXT: ; implicit-def: $sgpr4
 ; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: ; kill: killed $vgpr1
-; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; kill: killed $vgpr2
-; SI-NEXT: ; implicit-def: $vgpr29
-; SI-NEXT: ; implicit-def: $sgpr10
-; SI-NEXT: ; implicit-def: $sgpr6
+; SI-NEXT: v_mov_b32_e32 v39, v25
+; SI-NEXT: v_writelane_b32 v62, s4, 38
+; SI-NEXT: v_writelane_b32 v62, s5, 39
+; SI-NEXT: ; implicit-def: $sgpr4
+; SI-NEXT: v_mov_b32_e32 v12, v29
+; SI-NEXT: v_writelane_b32 v62, s4, 40
+; SI-NEXT: v_writelane_b32 v62, s5, 41
+; SI-NEXT: ; implicit-def: $sgpr4
+; SI-NEXT: v_mov_b32_e32 v20, v2
+; SI-NEXT: v_writelane_b32 v62, s4, 42
+; SI-NEXT: v_writelane_b32 v62, s5, 43
+; SI-NEXT: ; implicit-def: $sgpr4
+; SI-NEXT: v_mov_b32_e32 v6, v55
+; SI-NEXT: v_writelane_b32 v62, s4, 44
+; SI-NEXT: v_writelane_b32 v62, s5, 45
+; SI-NEXT: ; implicit-def: $sgpr4
+; SI-NEXT: v_mov_b32_e32 v17, v8
+; SI-NEXT: v_writelane_b32 v62, s4, 46
+; SI-NEXT: v_writelane_b32 v62, s5, 47
+; SI-NEXT: ; implicit-def: $sgpr4
+; SI-NEXT: v_mov_b32_e32 v29, v33
+; SI-NEXT: v_writelane_b32 v62, s4, 48
+; SI-NEXT: v_writelane_b32 v62, s5, 49
+; SI-NEXT: ; implicit-def: $sgpr4
+; SI-NEXT: ; implicit-def: $sgpr8
+; SI-NEXT: ; implicit-def: $sgpr96
+; SI-NEXT: ; implicit-def: $sgpr78
+; SI-NEXT: ; implicit-def: $sgpr7
+; SI-NEXT: ; implicit-def: $vgpr48
+; SI-NEXT: ; implicit-def: $sgpr86
+; SI-NEXT: ; implicit-def: $sgpr84
+; SI-NEXT: ; implicit-def: $sgpr61
+; SI-NEXT: ; implicit-def: $sgpr65
+; SI-NEXT: ; implicit-def: $vgpr16
+; SI-NEXT: ; implicit-def: $sgpr80
+; SI-NEXT: ; implicit-def: $sgpr70
+; SI-NEXT: ; implicit-def: $sgpr72
+; SI-NEXT: ; implicit-def: $sgpr69
+; SI-NEXT: ; implicit-def: $vgpr15
+; SI-NEXT: ; implicit-def: $sgpr66
+; SI-NEXT: ; implicit-def: $sgpr38
+; SI-NEXT: ; implicit-def: $sgpr75
+; SI-NEXT: ; implicit-def: $sgpr91
+; SI-NEXT: ; implicit-def: $vgpr13
+; SI-NEXT: ; implicit-def: $sgpr52
+; SI-NEXT: ; implicit-def: $sgpr98
+; SI-NEXT: ; implicit-def: $sgpr58
+; SI-NEXT: ; implicit-def: $sgpr37
 ; SI-NEXT: ; implicit-def: $vgpr11
+; SI-NEXT: ; implicit-def: $sgpr30
+; SI-NEXT: ; implicit-def: $sgpr82
+; SI-NEXT: ; implicit-def: $sgpr43
+; SI-NEXT: ; implicit-def: $vgpr47
+; SI-NEXT: ; implicit-def: $sgpr44
 ; SI-NEXT: ; implicit-def: $vgpr3
-; SI-NEXT: ; implicit-def: $vgpr44
-; SI-NEXT: ; implicit-def: $vgpr16
-; SI-NEXT: ; implicit-def: $sgpr16
-; SI-NEXT: ; implicit-def: $sgpr12
-; SI-NEXT: ; implicit-def: $sgpr8
-; SI-NEXT: ; implicit-def: $vgpr14
-; SI-NEXT: ; implicit-def: $vgpr7
-; SI-NEXT: ; implicit-def: $vgpr52
-; SI-NEXT: ; implicit-def: $vgpr26
-; SI-NEXT: ; implicit-def: $sgpr22
-; SI-NEXT: ; implicit-def: $sgpr18
-; SI-NEXT: ; implicit-def: $sgpr14
-; SI-NEXT: ; implicit-def: $vgpr19
-; SI-NEXT: ; implicit-def: $vgpr27
-; SI-NEXT: ; implicit-def: $vgpr24
+; SI-NEXT: ; implicit-def: $sgpr27
+; SI-NEXT: ; implicit-def: $vgpr18
 ; SI-NEXT: ; implicit-def: $sgpr28
-; SI-NEXT: ; implicit-def: $sgpr24
-; SI-NEXT: ; implicit-def: $sgpr20
-; SI-NEXT: ; implicit-def: $vgpr47
+; SI-NEXT: ; implicit-def: $sgpr74
 ; SI-NEXT: ; implicit-def: $vgpr9
-; SI-NEXT: ; implicit-def: $vgpr45
+; SI-NEXT: ; implicit-def: $sgpr22
+; SI-NEXT: ; implicit-def: $vgpr14
+; SI-NEXT: ; implicit-def: $sgpr18
 ; SI-NEXT: ; implicit-def: $vgpr56
-; SI-NEXT: ; implicit-def: $sgpr44
-; SI-NEXT: ; implicit-def: $sgpr40
-; SI-NEXT: ; implicit-def: $sgpr26
-; SI-NEXT: ; implicit-def: $vgpr58
-; SI-NEXT: ; implicit-def: $vgpr13
-; SI-NEXT: ; implicit-def: $vgpr32
-; SI-NEXT: ; implicit-def: $vgpr20
-; SI-NEXT: ; implicit-def: $sgpr58
-; SI-NEXT: ; implicit-def: $sgpr46
-; SI-NEXT: ; implicit-def: $sgpr42
-; SI-NEXT: ; implicit-def: $vgpr41
-; SI-NEXT: ; implicit-def: $vgpr12
-; SI-NEXT: ; implicit-def: $vgpr49
-; SI-NEXT: ; implicit-def: $sgpr72
-; SI-NEXT: ; implicit-def: $sgpr60
-; SI-NEXT: ; implicit-def: $sgpr56
-; SI-NEXT: ; implicit-def: $vgpr59
-; SI-NEXT: ; implicit-def: $vgpr4
-; SI-NEXT: ; implicit-def: $sgpr78
-; SI-NEXT: ; implicit-def: $sgpr74
-; SI-NEXT: ; implicit-def: $sgpr62
-; SI-NEXT: ; implicit-def: $vgpr61
-; SI-NEXT: ; implicit-def: $vgpr10
+; SI-NEXT: ; implicit-def: $sgpr15
+; SI-NEXT: ; implicit-def: $sgpr12
+; SI-NEXT: ; implicit-def: $sgpr6
+; SI-NEXT: ; implicit-def: $sgpr50
+; SI-NEXT: ; implicit-def: $sgpr94
+; SI-NEXT: ; implicit-def: $sgpr57
 ; SI-NEXT: ; implicit-def: $sgpr92
-; SI-NEXT: ; implicit-def: $sgpr88
+; SI-NEXT: ; implicit-def: $sgpr64
+; SI-NEXT: ; implicit-def: $sgpr79
 ; SI-NEXT: ; implicit-def: $sgpr76
-; SI-NEXT: ; implicit-def: $vgpr60
-; SI-NEXT: ; implicit-def: $vgpr23
-; SI-NEXT: ; implicit-def: $sgpr34
-; SI-NEXT: ; implicit-def: $sgpr94
 ; SI-NEXT: ; implicit-def: $sgpr90
-; SI-NEXT: ; implicit-def: $vgpr25
-; SI-NEXT: ; implicit-def: $vgpr18
-; SI-NEXT: ; implicit-def: $sgpr38
-; SI-NEXT: ; implicit-def: $sgpr36
-; SI-NEXT: ; implicit-def: $sgpr30
-; SI-NEXT: ; implicit-def: $vgpr22
-; SI-NEXT: ; implicit-def: $sgpr52
-; SI-NEXT: ; implicit-def: $sgpr50
-; SI-NEXT: ; implicit-def: $sgpr48
-; SI-NEXT: ; implicit-def: $vgpr17
-; SI-NEXT: ; implicit-def: $sgpr68
-; SI-NEXT: ; implicit-def: $sgpr64
+; SI-NEXT: ; implicit-def: $sgpr73
+; SI-NEXT: ; implicit-def: $sgpr62
 ; SI-NEXT: ; implicit-def: $sgpr54
-; SI-NEXT: ; implicit-def: $sgpr82
-; SI-NEXT: ; implicit-def: $sgpr70
-; SI-NEXT: ; implicit-def: $sgpr66
-; SI-NEXT: ; implicit-def: $sgpr96
-; SI-NEXT: ; implicit-def: $sgpr84
-; SI-NEXT: ; implicit-def: $sgpr80
-; SI-NEXT: ; implicit-def: $sgpr4
-; SI-NEXT: ; implicit-def: $sgpr98
-; SI-NEXT: ; implicit-def: $sgpr86
-; SI-NEXT: ; implicit-def: $vgpr30
-; SI-NEXT: ; implicit-def: $vgpr57
-; SI-NEXT: ; implicit-def: $vgpr46
-; SI-NEXT: ; implicit-def: $vgpr38
-; SI-NEXT: ; implicit-def: $vgpr36
-; SI-NEXT: ; kill: killed $vgpr36
-; SI-NEXT: ; implicit-def: $vgpr35
-; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload
-; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: ; kill: killed $vgpr1
-; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: ; kill: killed $vgpr1
-; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: ; kill: killed $vgpr1
-; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: ; kill: killed $vgpr1
-; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: ; kill: killed $vgpr1
-; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: ; kill: killed $vgpr1
-; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: ; kill: killed $vgpr1
-; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: ; kill: killed $vgpr1
-; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; kill: killed $vgpr1
-; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: ; kill: killed $vgpr2
-; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; kill: killed $vgpr1
-; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: ; kill: killed $vgpr2
-; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; kill: killed $vgpr1
-; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: ; kill: killed $vgpr2
-; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; kill: killed $vgpr1
-; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: ; kill: killed $vgpr2
-; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; kill: killed $vgpr1
-; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: ; kill: killed $vgpr2
-; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; kill: killed $vgpr1
-; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: ; kill: killed $vgpr2
-; SI-NEXT: ; implicit-def: $vgpr2
 ; SI-NEXT: ; kill: killed $vgpr1
 ; SI-NEXT: ; implicit-def: $vgpr1
 ; SI-NEXT: ; kill: killed $vgpr1
+; SI-NEXT: ; implicit-def: $vgpr55
+; SI-NEXT: v_writelane_b32 v62, s4, 50
+; SI-NEXT: v_writelane_b32 v62, s5, 51
+; SI-NEXT: ; implicit-def: $sgpr4
+; SI-NEXT: v_writelane_b32 v62, s4, 52
+; SI-NEXT: v_writelane_b32 v62, s5, 53
+; SI-NEXT: ; implicit-def: $sgpr4
+; SI-NEXT: v_writelane_b32 v62, s4, 54
+; SI-NEXT: v_writelane_b32 v62, s5, 55
+; SI-NEXT: ; implicit-def: $sgpr4
+; SI-NEXT: v_writelane_b32 v62, s4, 56
+; SI-NEXT: v_writelane_b32 v62, s5, 57
+; SI-NEXT: ; implicit-def: $sgpr4
+; SI-NEXT: v_writelane_b32 v62, s4, 58
+; SI-NEXT: v_writelane_b32 v62, s5, 59
+; SI-NEXT: ; implicit-def: $sgpr4
+; SI-NEXT: v_writelane_b32 v62, s4, 60
+; SI-NEXT: v_writelane_b32 v62, s5, 61
+; SI-NEXT: ; implicit-def: $sgpr4
+; SI-NEXT: v_writelane_b32 v62, s4, 62
+; SI-NEXT: v_writelane_b32 v62, s5, 63
+; SI-NEXT: ; implicit-def: $sgpr4
+; SI-NEXT: v_writelane_b32 v61, s4, 0
+; SI-NEXT: v_writelane_b32 v61, s5, 1
+; SI-NEXT: ; implicit-def: $sgpr4
+; SI-NEXT: v_writelane_b32 v61, s4, 2
+; SI-NEXT: v_writelane_b32 v61, s5, 3
+; SI-NEXT: ; implicit-def: $sgpr4
+; SI-NEXT: v_writelane_b32 v61, s4, 4
+; SI-NEXT: v_writelane_b32 v61, s5, 5
+; SI-NEXT: ; implicit-def: $sgpr4
+; SI-NEXT: v_writelane_b32 v61, s4, 6
+; SI-NEXT: v_writelane_b32 v61, s5, 7
+; SI-NEXT: ; implicit-def: $sgpr4
+; SI-NEXT: v_writelane_b32 v61, s4, 8
+; SI-NEXT: v_writelane_b32 v61, s5, 9
+; SI-NEXT: ; implicit-def: $sgpr4
+; SI-NEXT: v_writelane_b32 v61, s4, 10
+; SI-NEXT: v_writelane_b32 v61, s5, 11
+; SI-NEXT: ; implicit-def: $sgpr4
+; SI-NEXT: v_writelane_b32 v61, s4, 12
+; SI-NEXT: v_writelane_b32 v61, s5, 13
+; SI-NEXT: ; implicit-def: $sgpr4
+; SI-NEXT: v_writelane_b32 v61, s4, 14
+; SI-NEXT: v_writelane_b32 v61, s5, 15
+; SI-NEXT: ; implicit-def: $sgpr4
+; SI-NEXT: v_writelane_b32 v61, s4, 16
+; SI-NEXT: v_writelane_b32 v61, s5, 17
+; SI-NEXT: ; implicit-def: $sgpr4
+; SI-NEXT: v_writelane_b32 v61, s4, 18
+; SI-NEXT: v_writelane_b32 v61, s5, 19
+; SI-NEXT: ; implicit-def: $sgpr4
+; SI-NEXT: v_writelane_b32 v61, s4, 20
+; SI-NEXT: v_writelane_b32 v61, s5, 21
+; SI-NEXT: ; implicit-def: $sgpr4
+; SI-NEXT: v_writelane_b32 v61, s4, 22
+; SI-NEXT: v_writelane_b32 v61, s5, 23
+; SI-NEXT: ; implicit-def: $sgpr4
+; SI-NEXT: v_writelane_b32 v61, s4, 24
+; SI-NEXT: v_writelane_b32 v61, s5, 25
+; SI-NEXT: ; implicit-def: $sgpr5
+; SI-NEXT: v_writelane_b32 v61, s4, 26
+; SI-NEXT: v_writelane_b32 v61, s5, 27
+; SI-NEXT: v_writelane_b32 v61, s20, 28
+; SI-NEXT: v_writelane_b32 v61, s21, 29
+; SI-NEXT: ; implicit-def: $sgpr20
+; SI-NEXT: ; implicit-def: $sgpr4
+; SI-NEXT: v_writelane_b32 v61, s20, 30
+; SI-NEXT: v_writelane_b32 v61, s21, 31
+; SI-NEXT: v_writelane_b32 v61, s88, 32
+; SI-NEXT: v_writelane_b32 v61, s89, 33
+; SI-NEXT: ; implicit-def: $sgpr88
 ; SI-NEXT: .LBB91_3: ; %Flow
-; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt expcnt(3)
-; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt expcnt(1)
-; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
 ; SI-NEXT: s_andn2_b64 vcc, exec, vcc
 ; SI-NEXT: s_cbranch_vccnz .LBB91_5
 ; SI-NEXT: ; %bb.4: ; %cmp.true
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload
 ; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v43
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v46
+; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11
 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
-; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT: v_alignbit_b32 v7, v3, v2, 16
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload
-; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
-; SI-NEXT: v_readfirstlane_b32 s4, v7
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v1
-; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
-; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v37
-; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT: s_waitcnt vmcnt(3)
-; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
-; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
-; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; SI-NEXT: v_alignbit_b32 v10, v6, v4, 16
-; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v5
-; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v33
-; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
-; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
-; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
-; SI-NEXT: v_readfirstlane_b32 s52, v10
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v51
-; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10
-; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10
-; SI-NEXT: s_waitcnt vmcnt(3)
-; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; SI-NEXT: v_readfirstlane_b32 s4, v11
+; SI-NEXT: s_lshr_b32 s5, s4, 16
+; SI-NEXT: v_readfirstlane_b32 s4, v3
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
+; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT: v_readfirstlane_b32 s12, v3
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v24
+; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v53
+; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
-; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; SI-NEXT: v_alignbit_b32 v9, v3, v2, 16
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload
-; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
-; SI-NEXT: v_readfirstlane_b32 s86, v9
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v50
+; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9
+; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v38
+; SI-NEXT: v_readfirstlane_b32 s6, v9
+; SI-NEXT: v_readfirstlane_b32 s8, v3
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v40
+; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
+; SI-NEXT: s_lshr_b32 s9, s6, 16
+; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
-; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9
-; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9
-; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; SI-NEXT: v_readfirstlane_b32 s6, v8
+; SI-NEXT: s_lshr_b64 s[10:11], s[8:9], 16
+; SI-NEXT: s_lshr_b32 s9, s6, 16
+; SI-NEXT: v_readfirstlane_b32 s8, v3
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v54
+; SI-NEXT: s_mov_b32 s7, s9
+; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
-; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
-; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; SI-NEXT: v_alignbit_b32 v12, v3, v2, 16
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload
-; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
-; SI-NEXT: v_readfirstlane_b32 s80, v12
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(3)
-; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v58
+; SI-NEXT: v_writelane_b32 v61, s6, 26
+; SI-NEXT: s_lshr_b64 s[20:21], s[8:9], 16
+; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
+; SI-NEXT: v_readfirstlane_b32 s8, v3
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v49
+; SI-NEXT: v_writelane_b32 v61, s7, 27
+; SI-NEXT: v_readfirstlane_b32 s6, v5
+; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v50
+; SI-NEXT: s_lshr_b32 s9, s6, 16
+; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
-; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; SI-NEXT: v_alignbit_b32 v13, v3, v2, 16
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload
-; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
-; SI-NEXT: v_readfirstlane_b32 s66, v13
-; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
-; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12
-; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v2
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v2
-; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
-; SI-NEXT: v_alignbit_b32 v38, v4, v3, 16
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v55
-; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v15
+; SI-NEXT: v_readfirstlane_b32 s18, v3
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v37
+; SI-NEXT: s_lshr_b64 s[16:17], s[8:9], 16
+; SI-NEXT: v_readfirstlane_b32 s6, v2
+; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT: s_lshr_b32 s19, s6, 16
+; SI-NEXT: v_readfirstlane_b32 s6, v3
+; SI-NEXT: s_lshr_b32 s9, s6, 16
+; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v12
+; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
+; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v21
+; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9
+; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v43
+; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10
+; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v44
+; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v35
+; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
+; SI-NEXT: s_lshr_b64 s[26:27], s[18:19], 16
+; SI-NEXT: s_mov_b32 s17, s26
+; SI-NEXT: s_mov_b32 s11, s20
+; SI-NEXT: s_lshr_b64 s[4:5], s[4:5], 16
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v1
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v51
+; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT: v_readfirstlane_b32 s8, v1
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v36
+; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v1
+; SI-NEXT: v_readfirstlane_b32 s22, v3
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v52
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v57
 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
-; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
-; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
-; SI-NEXT: v_alignbit_b32 v15, v7, v6, 16
-; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v4
-; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v42
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
-; SI-NEXT: v_alignbit_b32 v46, v6, v3, 16
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v48
-; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
-; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v39
-; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
-; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; SI-NEXT: s_lshr_b64 s[24:25], s[8:9], 16
+; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT: v_readfirstlane_b32 s8, v3
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v7
+; SI-NEXT: v_readfirstlane_b32 s6, v1
 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
-; SI-NEXT: v_alignbit_b32 v23, v7, v6, 16
-; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v5
-; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v53
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
-; SI-NEXT: v_alignbit_b32 v57, v6, v3, 16
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v54
-; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
-; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v28
-; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
-; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; SI-NEXT: s_lshr_b32 s23, s6, 16
+; SI-NEXT: v_readfirstlane_b32 s6, v5
+; SI-NEXT: v_readfirstlane_b32 s28, v3
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v27
+; SI-NEXT: s_lshr_b32 s9, s6, 16
+; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v29
 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
-; SI-NEXT: v_alignbit_b32 v16, v7, v6, 16
-; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v1
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
-; SI-NEXT: v_alignbit_b32 v30, v6, v3, 16
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload
-; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v21
+; SI-NEXT: s_lshr_b64 s[40:41], s[8:9], 16
+; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
+; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v28
+; SI-NEXT: v_readfirstlane_b32 s8, v3
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v22
+; SI-NEXT: v_readfirstlane_b32 s6, v5
 ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
-; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v40
-; SI-NEXT: v_alignbit_b32 v18, v9, v7, 16
-; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v34
 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
-; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9
-; SI-NEXT: v_alignbit_b32 v20, v10, v9, 16
-; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload
-; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
-; SI-NEXT: v_readfirstlane_b32 s38, v15
-; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
-; SI-NEXT: v_readfirstlane_b32 s90, v16
-; SI-NEXT: s_waitcnt expcnt(1)
-; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload
-; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
-; SI-NEXT: v_readfirstlane_b32 s30, v23
-; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
-; SI-NEXT: v_readfirstlane_b32 s76, v18
-; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
-; SI-NEXT: v_readfirstlane_b32 s62, v20
-; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v1
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v5
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v4
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
-; SI-NEXT: v_readfirstlane_b32 s5, v38
-; SI-NEXT: v_readfirstlane_b32 s87, v46
-; SI-NEXT: v_readfirstlane_b32 s81, v57
-; SI-NEXT: v_readfirstlane_b32 s67, v30
-; SI-NEXT: s_lshr_b64 s[54:55], s[66:67], 24
-; SI-NEXT: s_lshr_b64 s[64:65], s[66:67], 16
-; SI-NEXT: s_lshr_b64 s[68:69], s[66:67], 8
-; SI-NEXT: s_lshr_b64 s[66:67], s[80:81], 24
-; SI-NEXT: s_lshr_b64 s[70:71], s[80:81], 16
-; SI-NEXT: s_lshr_b64 s[82:83], s[80:81], 8
-; SI-NEXT: s_lshr_b64 s[80:81], s[86:87], 24
-; SI-NEXT: s_lshr_b64 s[84:85], s[86:87], 16
-; SI-NEXT: s_lshr_b64 s[96:97], s[86:87], 8
-; SI-NEXT: s_lshr_b64 s[86:87], s[4:5], 24
-; SI-NEXT: s_lshr_b64 s[98:99], s[4:5], 16
-; SI-NEXT: s_lshr_b64 s[4:5], s[4:5], 8
-; SI-NEXT: v_lshrrev_b32_e32 v35, 24, v2
-; SI-NEXT: v_lshrrev_b32_e32 v2, 8, v30
-; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(14)
-; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
-; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6
-; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
-; SI-NEXT: v_alignbit_b32 v17, v7, v3, 16
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload
-; SI-NEXT: v_readfirstlane_b32 s53, v17
-; SI-NEXT: s_lshr_b64 s[48:49], s[52:53], 24
-; SI-NEXT: s_waitcnt vmcnt(14)
-; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
-; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10
-; SI-NEXT: v_alignbit_b32 v21, v12, v10, 16
-; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload
-; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(14)
-; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
-; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
-; SI-NEXT: s_waitcnt vmcnt(13)
-; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
-; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
-; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13
-; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15
-; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
-; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
-; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16
-; SI-NEXT: v_readfirstlane_b32 s56, v21
-; SI-NEXT: s_lshr_b64 s[50:51], s[52:53], 16
-; SI-NEXT: s_lshr_b64 s[52:53], s[52:53], 8
-; SI-NEXT: v_lshrrev_b32_e32 v6, 24, v6
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
-; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(8)
-; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
-; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
-; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18
-; SI-NEXT: s_waitcnt vmcnt(7)
-; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
-; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20
-; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20
-; SI-NEXT: s_waitcnt vmcnt(5)
-; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
-; SI-NEXT: s_waitcnt vmcnt(4)
-; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; SI-NEXT: s_lshr_b32 s29, s6, 16
+; SI-NEXT: v_readfirstlane_b32 s6, v7
+; SI-NEXT: v_readfirstlane_b32 s44, v3
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v34
+; SI-NEXT: s_lshr_b32 s9, s6, 16
+; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v26
+; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT: s_lshr_b64 s[46:47], s[8:9], 16
 ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
+; SI-NEXT: v_readfirstlane_b32 s8, v3
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v39
+; SI-NEXT: v_readfirstlane_b32 s6, v7
 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
-; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v7
-; SI-NEXT: v_alignbit_b32 v22, v9, v3, 16
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(4)
-; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
-; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12
-; SI-NEXT: v_alignbit_b32 v23, v13, v12, 16
-; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload
-; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v31
+; SI-NEXT: s_lshr_b32 s45, s6, 16
+; SI-NEXT: v_readfirstlane_b32 s6, v9
+; SI-NEXT: v_readfirstlane_b32 s58, v3
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v30
+; SI-NEXT: s_lshr_b32 s9, s6, 16
+; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT: s_lshr_b64 s[62:63], s[8:9], 16
+; SI-NEXT: v_readfirstlane_b32 s8, v3
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v23
 ; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9
-; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v9
-; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
-; SI-NEXT: v_readfirstlane_b32 s42, v23
-; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v22
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v17
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v57
-; SI-NEXT: v_readfirstlane_b32 s39, v22
+; SI-NEXT: v_readfirstlane_b32 s6, v9
+; SI-NEXT: s_lshr_b32 s59, s6, 16
+; SI-NEXT: v_readfirstlane_b32 s6, v10
+; SI-NEXT: s_lshr_b32 s9, s6, 16
+; SI-NEXT: s_lshr_b64 s[76:77], s[8:9], 16
+; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v17
+; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v11
+; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10
+; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v42
+; SI-NEXT: v_readfirstlane_b32 s6, v10
+; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11
+; SI-NEXT: s_lshr_b32 s73, s6, 16
+; SI-NEXT: v_readfirstlane_b32 s6, v11
+; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v6
+; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v11
+; SI-NEXT: s_lshr_b32 s9, s6, 16
+; SI-NEXT: v_readfirstlane_b32 s6, v18
+; SI-NEXT: s_lshr_b32 s79, s6, 16
+; SI-NEXT: s_lshr_b64 s[54:55], s[58:59], 16
+; SI-NEXT: s_mov_b32 s63, s54
+; SI-NEXT: s_lshr_b64 s[60:61], s[44:45], 16
+; SI-NEXT: s_mov_b32 s47, s60
+; SI-NEXT: s_lshr_b64 s[42:43], s[28:29], 16
+; SI-NEXT: s_mov_b32 s41, s42
+; SI-NEXT: s_lshr_b64 s[34:35], s[22:23], 16
+; SI-NEXT: s_mov_b32 s25, s34
+; SI-NEXT: v_readfirstlane_b32 s5, v14
+; SI-NEXT: s_lshr_b32 s13, s5, 16
+; SI-NEXT: s_lshr_b64 vcc, s[12:13], 16
+; SI-NEXT: s_mov_b32 s5, vcc_lo
+; SI-NEXT: v_lshrrev_b32_e32 v56, 24, v1
+; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v2
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v8
+; SI-NEXT: s_lshr_b32 s22, s60, 8
+; SI-NEXT: s_lshr_b32 s21, s42, 8
+; SI-NEXT: s_lshr_b32 s18, s34, 8
+; SI-NEXT: s_lshr_b32 s12, s20, 8
+; SI-NEXT: v_lshrrev_b32_e32 v18, 24, v18
+; SI-NEXT: v_lshrrev_b32_e32 v55, 24, v10
 ; SI-NEXT: v_lshrrev_b32_e32 v9, 24, v9
-; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v7
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v46
-; SI-NEXT: s_lshr_b64 s[36:37], s[38:39], 16
-; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v38
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(12)
-; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
-; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21
-; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v21
-; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(11)
-; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
-; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
-; SI-NEXT: v_alignbit_b32 v24, v15, v13, 16
-; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v14
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(5)
 ; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
-; SI-NEXT: v_alignbit_b32 v25, v10, v3, 16
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload
-; SI-NEXT: v_readfirstlane_b32 s31, v25
-; SI-NEXT: v_readfirstlane_b32 s26, v24
-; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
-; SI-NEXT: s_lshr_b64 s[94:95], s[30:31], 16
-; SI-NEXT: s_lshr_b64 s[34:35], s[30:31], 8
-; SI-NEXT: s_waitcnt vmcnt(4)
-; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
-; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23
-; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v23
-; SI-NEXT: s_waitcnt vmcnt(3)
-; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
-; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
-; SI-NEXT: v_alignbit_b32 v26, v16, v15, 16
-; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(3)
-; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; SI-NEXT: v_readfirstlane_b32 s72, v3
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v32
 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
-; SI-NEXT: v_readfirstlane_b32 s20, v26
-; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v26, 24, v21
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
-; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
-; SI-NEXT: v_alignbit_b32 v27, v18, v16, 16
-; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload
-; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
-; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10
-; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v10
-; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
-; SI-NEXT: v_alignbit_b32 v60, v12, v3, 16
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload
-; SI-NEXT: v_readfirstlane_b32 s91, v60
-; SI-NEXT: v_readfirstlane_b32 s14, v27
-; SI-NEXT: v_lshrrev_b32_e32 v10, 24, v10
-; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
-; SI-NEXT: s_lshr_b64 s[88:89], s[90:91], 16
-; SI-NEXT: s_lshr_b64 s[92:93], s[90:91], 8
-; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(5)
-; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
-; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
-; SI-NEXT: v_alignbit_b32 v29, v20, v18, 16
-; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
-; SI-NEXT: v_readfirstlane_b32 s8, v29
-; SI-NEXT: s_waitcnt vmcnt(5)
-; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
-; SI-NEXT: s_waitcnt vmcnt(4)
+; SI-NEXT: v_readfirstlane_b32 s8, v3
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
+; SI-NEXT: s_lshr_b64 s[92:93], s[8:9], 16
+; SI-NEXT: s_lshr_b64 s[74:75], s[72:73], 16
+; SI-NEXT: s_mov_b32 s77, s74
+; SI-NEXT: s_lshr_b32 s28, s74, 8
+; SI-NEXT: s_waitcnt vmcnt(0)
 ; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12
 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
-; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v12
-; SI-NEXT: v_alignbit_b32 v61, v11, v3, 16
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload
-; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
-; SI-NEXT: v_readfirstlane_b32 s77, v61
-; SI-NEXT: v_lshrrev_b32_e32 v12, 24, v12
-; SI-NEXT: s_lshr_b64 s[74:75], s[76:77], 16
-; SI-NEXT: s_lshr_b64 s[78:79], s[76:77], 8
-; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(2)
-; SI-NEXT: v_lshrrev_b32_e32 v10, 8, v61
-; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(5)
-; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
-; SI-NEXT: s_waitcnt vmcnt(4)
-; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
-; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
-; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v13
-; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
-; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v18
-; SI-NEXT: v_lshrrev_b32_e32 v13, 24, v13
-; SI-NEXT: v_lshrrev_b32_e32 v56, 24, v18
-; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(4)
+; SI-NEXT: v_readfirstlane_b32 s78, v3
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
+; SI-NEXT: s_lshr_b64 s[48:49], s[78:79], 16
+; SI-NEXT: s_mov_b32 s93, s48
+; SI-NEXT: s_lshr_b32 s27, s48, 8
+; SI-NEXT: s_waitcnt vmcnt(0)
 ; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
-; SI-NEXT: v_alignbit_b32 v59, v36, v3, 16
-; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v8
-; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v3
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
-; SI-NEXT: v_readfirstlane_b32 s63, v59
-; SI-NEXT: s_lshr_b64 s[60:61], s[62:63], 16
-; SI-NEXT: s_lshr_b64 s[72:73], s[62:63], 8
-; SI-NEXT: v_lshrrev_b32_e32 v4, 8, v59
-; SI-NEXT: v_lshrrev_b32_e32 v18, 8, v25
-; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
-; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20
-; SI-NEXT: s_waitcnt expcnt(2)
-; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v20
-; SI-NEXT: v_lshrrev_b32_e32 v24, 24, v20
-; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
+; SI-NEXT: v_readfirstlane_b32 s8, v3
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11
+; SI-NEXT: v_readfirstlane_b32 s6, v11
+; SI-NEXT: s_lshr_b32 s9, s6, 16
+; SI-NEXT: s_lshr_b64 s[50:51], s[8:9], 16
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v3
+; SI-NEXT: v_readfirstlane_b32 s56, v11
+; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v41
+; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11
+; SI-NEXT: v_readfirstlane_b32 s8, v11
+; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v20
+; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT: v_readfirstlane_b32 s6, v3
+; SI-NEXT: s_lshr_b32 s57, s6, 16
+; SI-NEXT: v_readfirstlane_b32 s6, v13
+; SI-NEXT: s_lshr_b32 s9, s6, 16
+; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v19
+; SI-NEXT: s_lshr_b64 s[30:31], s[8:9], 16
+; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v13
+; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
+; SI-NEXT: v_readfirstlane_b32 s6, v24
+; SI-NEXT: s_lshr_b32 s89, s6, 16
+; SI-NEXT: s_lshr_b64 s[94:95], s[56:57], 16
+; SI-NEXT: s_mov_b32 s51, s94
+; SI-NEXT: s_lshr_b32 s44, s94, 8
+; SI-NEXT: s_mov_b32 s56, s42
+; SI-NEXT: v_lshrrev_b32_e32 v47, 24, v24
+; SI-NEXT: v_lshrrev_b32_e32 v3, 24, v3
 ; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11
+; SI-NEXT: v_readfirstlane_b32 s88, v11
+; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v31
+; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11
+; SI-NEXT: v_readfirstlane_b32 s8, v11
+; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
+; SI-NEXT: s_lshr_b64 s[82:83], s[88:89], 16
+; SI-NEXT: s_mov_b32 s31, s82
+; SI-NEXT: s_lshr_b32 s43, s82, 8
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
+; SI-NEXT: v_readfirstlane_b32 s6, v13
+; SI-NEXT: s_lshr_b32 s9, s6, 16
+; SI-NEXT: s_lshr_b64 s[52:53], s[8:9], 16
+; SI-NEXT: v_readfirstlane_b32 s6, v4
+; SI-NEXT: s_lshr_b32 s37, s6, 16
+; SI-NEXT: s_mov_b32 s88, vcc_lo
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11
+; SI-NEXT: v_readfirstlane_b32 s36, v11
+; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v6
+; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
+; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11
+; SI-NEXT: v_readfirstlane_b32 s8, v11
+; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v59
+; SI-NEXT: s_lshr_b64 s[98:99], s[36:37], 16
+; SI-NEXT: s_mov_b32 s53, s98
+; SI-NEXT: s_lshr_b32 s58, s98, 8
+; SI-NEXT: s_mov_b32 s36, s26
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v6
+; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
+; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
+; SI-NEXT: v_readfirstlane_b32 s6, v13
+; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v11
+; SI-NEXT: v_readfirstlane_b32 s90, v13
+; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v60
+; SI-NEXT: s_lshr_b32 s9, s6, 16
+; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11
+; SI-NEXT: s_lshr_b64 s[66:67], s[8:9], 16
+; SI-NEXT: v_readfirstlane_b32 s6, v11
+; SI-NEXT: s_lshr_b32 s91, s6, 16
+; SI-NEXT: s_lshr_b64 s[38:39], s[90:91], 16
+; SI-NEXT: s_mov_b32 s67, s38
+; SI-NEXT: s_lshr_b32 s75, s38, 8
+; SI-NEXT: s_mov_b32 s90, s74
+; SI-NEXT: s_lshr_b32 s74, s54, 8
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v6
+; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
+; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
+; SI-NEXT: v_readfirstlane_b32 s8, v13
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v6
+; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
+; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
+; SI-NEXT: v_readfirstlane_b32 s6, v15
+; SI-NEXT: s_lshr_b32 s9, s6, 16
+; SI-NEXT: s_lshr_b64 s[80:81], s[8:9], 16
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v6
+; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v13
+; SI-NEXT: v_readfirstlane_b32 s68, v15
+; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v45
+; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
+; SI-NEXT: v_readfirstlane_b32 s6, v13
+; SI-NEXT: s_lshr_b32 s69, s6, 16
+; SI-NEXT: s_lshr_b64 s[70:71], s[68:69], 16
+; SI-NEXT: s_mov_b32 s81, s70
+; SI-NEXT: s_lshr_b32 s72, s70, 8
+; SI-NEXT: s_mov_b32 s68, s60
+; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
+; SI-NEXT: v_readfirstlane_b32 s8, v15
+; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v6
+; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
+; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
+; SI-NEXT: v_readfirstlane_b32 s64, v15
+; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
+; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
+; SI-NEXT: v_readfirstlane_b32 s6, v16
+; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
+; SI-NEXT: s_lshr_b32 s9, s6, 16
+; SI-NEXT: s_lshr_b64 s[86:87], s[8:9], 16
+; SI-NEXT: v_readfirstlane_b32 s6, v12
+; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; SI-NEXT: s_lshr_b32 s65, s6, 16
+; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
+; SI-NEXT: s_lshr_b64 s[84:85], s[64:65], 16
+; SI-NEXT: s_mov_b32 s87, s84
+; SI-NEXT: v_lshrrev_b32_e32 v48, 24, v6
+; SI-NEXT: s_lshr_b32 s61, s84, 8
+; SI-NEXT: s_mov_b32 s64, s48
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
+; SI-NEXT: v_readfirstlane_b32 s8, v15
+; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
+; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
+; SI-NEXT: v_readfirstlane_b32 s6, v16
+; SI-NEXT: s_lshr_b32 s9, s6, 16
+; SI-NEXT: v_readfirstlane_b32 s6, v6
+; SI-NEXT: s_lshr_b32 s7, s6, 16
+; SI-NEXT: s_lshr_b64 s[8:9], s[8:9], 16
+; SI-NEXT: v_lshrrev_b32_e32 v6, 24, v7
+; SI-NEXT: v_lshrrev_b32_e32 v16, 24, v12
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
+; SI-NEXT: v_readfirstlane_b32 s6, v15
+; SI-NEXT: s_lshr_b64 s[96:97], s[6:7], 16
+; SI-NEXT: s_mov_b32 s9, s96
+; SI-NEXT: s_lshr_b64 s[14:15], s[8:9], 24
+; SI-NEXT: v_writelane_b32 v62, s14, 4
+; SI-NEXT: v_writelane_b32 v62, s15, 5
+; SI-NEXT: s_lshr_b64 s[14:15], s[8:9], 16
+; SI-NEXT: v_writelane_b32 v62, s14, 2
+; SI-NEXT: v_writelane_b32 v62, s15, 3
+; SI-NEXT: s_lshr_b64 s[14:15], s[8:9], 8
+; SI-NEXT: v_writelane_b32 v62, s14, 0
+; SI-NEXT: v_writelane_b32 v62, s15, 1
+; SI-NEXT: s_lshr_b64 s[14:15], s[86:87], 24
+; SI-NEXT: v_writelane_b32 v62, s14, 10
+; SI-NEXT: v_writelane_b32 v62, s15, 11
+; SI-NEXT: s_lshr_b64 s[14:15], s[86:87], 16
+; SI-NEXT: v_writelane_b32 v62, s14, 8
+; SI-NEXT: v_writelane_b32 v62, s15, 9
+; SI-NEXT: s_lshr_b64 s[14:15], s[86:87], 8
+; SI-NEXT: v_writelane_b32 v62, s14, 6
+; SI-NEXT: v_writelane_b32 v62, s15, 7
+; SI-NEXT: s_lshr_b64 s[14:15], s[80:81], 24
+; SI-NEXT: v_writelane_b32 v62, s14, 16
+; SI-NEXT: v_writelane_b32 v62, s15, 17
+; SI-NEXT: s_lshr_b64 s[14:15], s[80:81], 16
+; SI-NEXT: v_writelane_b32 v62, s14, 14
+; SI-NEXT: v_writelane_b32 v62, s15, 15
+; SI-NEXT: s_lshr_b64 s[14:15], s[80:81], 8
+; SI-NEXT: v_writelane_b32 v62, s14, 12
+; SI-NEXT: v_writelane_b32 v62, s15, 13
+; SI-NEXT: s_lshr_b64 s[14:15], s[66:67], 24
+; SI-NEXT: v_writelane_b32 v62, s14, 22
+; SI-NEXT: v_writelane_b32 v62, s15, 23
+; SI-NEXT: s_lshr_b64 s[14:15], s[66:67], 16
+; SI-NEXT: v_writelane_b32 v62, s14, 20
+; SI-NEXT: v_writelane_b32 v62, s15, 21
+; SI-NEXT: s_lshr_b64 s[14:15], s[66:67], 8
+; SI-NEXT: v_writelane_b32 v62, s14, 18
+; SI-NEXT: v_writelane_b32 v62, s15, 19
+; SI-NEXT: s_lshr_b64 s[14:15], s[52:53], 24
+; SI-NEXT: v_writelane_b32 v62, s14, 28
+; SI-NEXT: v_writelane_b32 v62, s15, 29
+; SI-NEXT: s_lshr_b64 s[14:15], s[52:53], 16
+; SI-NEXT: v_writelane_b32 v62, s14, 26
+; SI-NEXT: v_writelane_b32 v62, s15, 27
+; SI-NEXT: s_lshr_b64 s[14:15], s[52:53], 8
+; SI-NEXT: v_writelane_b32 v62, s14, 24
+; SI-NEXT: v_writelane_b32 v62, s15, 25
+; SI-NEXT: s_lshr_b64 s[14:15], s[30:31], 24
+; SI-NEXT: v_writelane_b32 v62, s14, 34
+; SI-NEXT: v_writelane_b32 v62, s15, 35
+; SI-NEXT: s_lshr_b64 s[14:15], s[30:31], 16
+; SI-NEXT: v_writelane_b32 v62, s14, 32
+; SI-NEXT: v_writelane_b32 v62, s15, 33
+; SI-NEXT: s_lshr_b64 s[14:15], s[30:31], 8
+; SI-NEXT: v_writelane_b32 v62, s14, 30
+; SI-NEXT: v_writelane_b32 v62, s15, 31
+; SI-NEXT: s_lshr_b64 s[14:15], s[50:51], 24
+; SI-NEXT: v_writelane_b32 v62, s14, 40
+; SI-NEXT: v_writelane_b32 v62, s15, 41
+; SI-NEXT: s_lshr_b64 s[14:15], s[50:51], 16
+; SI-NEXT: v_writelane_b32 v62, s14, 38
+; SI-NEXT: v_writelane_b32 v62, s15, 39
+; SI-NEXT: s_lshr_b64 s[14:15], s[50:51], 8
+; SI-NEXT: v_writelane_b32 v62, s14, 36
+; SI-NEXT: v_writelane_b32 v62, s15, 37
+; SI-NEXT: s_lshr_b64 s[14:15], s[92:93], 24
+; SI-NEXT: v_writelane_b32 v62, s14, 46
+; SI-NEXT: v_writelane_b32 v62, s15, 47
+; SI-NEXT: s_lshr_b64 s[14:15], s[92:93], 16
+; SI-NEXT: v_writelane_b32 v62, s14, 44
+; SI-NEXT: v_writelane_b32 v62, s15, 45
+; SI-NEXT: s_lshr_b64 s[14:15], s[92:93], 8
+; SI-NEXT: v_writelane_b32 v62, s14, 42
+; SI-NEXT: v_writelane_b32 v62, s15, 43
+; SI-NEXT: s_lshr_b64 s[14:15], s[76:77], 24
+; SI-NEXT: v_writelane_b32 v62, s14, 52
+; SI-NEXT: v_writelane_b32 v62, s15, 53
+; SI-NEXT: s_lshr_b64 s[14:15], s[76:77], 16
+; SI-NEXT: v_writelane_b32 v62, s14, 50
+; SI-NEXT: v_writelane_b32 v62, s15, 51
+; SI-NEXT: s_lshr_b64 s[14:15], s[76:77], 8
+; SI-NEXT: v_writelane_b32 v62, s14, 48
+; SI-NEXT: v_writelane_b32 v62, s15, 49
+; SI-NEXT: s_lshr_b64 s[14:15], s[62:63], 24
+; SI-NEXT: v_writelane_b32 v62, s14, 58
+; SI-NEXT: v_writelane_b32 v62, s15, 59
+; SI-NEXT: s_lshr_b64 s[14:15], s[62:63], 16
+; SI-NEXT: v_writelane_b32 v62, s14, 56
+; SI-NEXT: v_writelane_b32 v62, s15, 57
+; SI-NEXT: s_lshr_b64 s[14:15], s[62:63], 8
+; SI-NEXT: v_writelane_b32 v62, s14, 54
+; SI-NEXT: v_writelane_b32 v62, s15, 55
+; SI-NEXT: s_lshr_b64 s[14:15], s[46:47], 24
+; SI-NEXT: v_writelane_b32 v61, s14, 0
+; SI-NEXT: v_writelane_b32 v61, s15, 1
+; SI-NEXT: s_lshr_b64 s[14:15], s[46:47], 16
+; SI-NEXT: v_writelane_b32 v62, s14, 62
+; SI-NEXT: v_writelane_b32 v62, s15, 63
+; SI-NEXT: s_lshr_b64 s[14:15], s[46:47], 8
+; SI-NEXT: v_writelane_b32 v62, s14, 60
+; SI-NEXT: v_writelane_b32 v62, s15, 61
+; SI-NEXT: s_lshr_b64 s[14:15], s[40:41], 24
+; SI-NEXT: v_writelane_b32 v61, s14, 6
+; SI-NEXT: v_writelane_b32 v61, s15, 7
+; SI-NEXT: s_lshr_b64 s[14:15], s[40:41], 16
+; SI-NEXT: v_writelane_b32 v61, s14, 4
+; SI-NEXT: v_writelane_b32 v61, s15, 5
+; SI-NEXT: s_lshr_b64 s[14:15], s[40:41], 8
+; SI-NEXT: v_writelane_b32 v61, s14, 2
+; SI-NEXT: v_writelane_b32 v61, s15, 3
+; SI-NEXT: s_lshr_b64 s[14:15], s[24:25], 24
+; SI-NEXT: v_writelane_b32 v61, s14, 12
+; SI-NEXT: v_writelane_b32 v61, s15, 13
+; SI-NEXT: s_lshr_b64 s[14:15], s[24:25], 16
+; SI-NEXT: v_writelane_b32 v61, s14, 10
+; SI-NEXT: v_writelane_b32 v61, s15, 11
+; SI-NEXT: s_lshr_b64 s[14:15], s[24:25], 8
+; SI-NEXT: v_writelane_b32 v61, s14, 8
+; SI-NEXT: v_writelane_b32 v61, s15, 9
+; SI-NEXT: s_lshr_b64 s[14:15], s[16:17], 24
+; SI-NEXT: v_writelane_b32 v61, s14, 18
+; SI-NEXT: v_writelane_b32 v61, s15, 19
+; SI-NEXT: s_lshr_b64 s[14:15], s[16:17], 16
+; SI-NEXT: v_writelane_b32 v61, s14, 16
+; SI-NEXT: v_writelane_b32 v61, s15, 17
+; SI-NEXT: s_lshr_b64 s[14:15], s[16:17], 8
+; SI-NEXT: v_writelane_b32 v61, s14, 14
+; SI-NEXT: v_writelane_b32 v61, s15, 15
+; SI-NEXT: s_lshr_b64 s[14:15], s[10:11], 24
+; SI-NEXT: v_writelane_b32 v61, s14, 24
+; SI-NEXT: v_writelane_b32 v61, s15, 25
+; SI-NEXT: s_lshr_b64 s[14:15], s[10:11], 16
+; SI-NEXT: v_writelane_b32 v61, s14, 22
+; SI-NEXT: v_writelane_b32 v61, s15, 23
+; SI-NEXT: s_lshr_b64 s[14:15], s[10:11], 8
+; SI-NEXT: v_writelane_b32 v61, s14, 20
+; SI-NEXT: v_writelane_b32 v61, s15, 21
+; SI-NEXT: s_lshr_b64 s[14:15], s[4:5], 24
+; SI-NEXT: v_writelane_b32 v61, s14, 32
+; SI-NEXT: v_writelane_b32 v61, s15, 33
+; SI-NEXT: s_lshr_b64 s[14:15], s[4:5], 16
+; SI-NEXT: v_writelane_b32 v61, s14, 30
+; SI-NEXT: v_writelane_b32 v61, s15, 31
+; SI-NEXT: s_lshr_b64 s[14:15], s[4:5], 8
+; SI-NEXT: v_writelane_b32 v61, s14, 28
+; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v13
+; SI-NEXT: v_lshrrev_b32_e32 v13, 24, v11
+; SI-NEXT: v_lshrrev_b32_e32 v11, 24, v4
+; SI-NEXT: v_lshrrev_b32_e32 v4, 24, v5
+; SI-NEXT: v_writelane_b32 v61, s15, 29
+; SI-NEXT: 
s_lshr_b32 s78, s96, 8 +; SI-NEXT: s_lshr_b32 s15, s26, 8 +; SI-NEXT: s_mov_b32 s14, s20 +; SI-NEXT: s_lshr_b32 s6, vcc_lo, 8 +; SI-NEXT: v_mov_b32_e32 v14, v4 +; SI-NEXT: v_mov_b32_e32 v4, v6 ; SI-NEXT: .LBB91_5: ; %end -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v29 -; SI-NEXT: s_lshl_b32 s5, s10, 8 -; SI-NEXT: v_or_b32_e32 v1, s5, v1 -; SI-NEXT: s_and_b32 s5, s6, 0xff -; SI-NEXT: v_readlane_b32 s6, v62, 0 -; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: s_lshl_b32 s6, s6, 24 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: s_or_b32 s5, s6, s5 -; SI-NEXT: v_or_b32_e32 v1, s5, v1 +; SI-NEXT: s_and_b32 s5, s8, 0xff +; SI-NEXT: v_readlane_b32 s8, v62, 0 +; SI-NEXT: v_readlane_b32 s9, v62, 1 +; SI-NEXT: s_lshl_b32 s8, s8, 8 +; SI-NEXT: s_or_b32 s5, s5, s8 +; SI-NEXT: v_readlane_b32 s8, v62, 2 +; SI-NEXT: v_readlane_b32 s9, v62, 3 +; SI-NEXT: s_and_b32 s8, s8, 0xff +; SI-NEXT: v_readlane_b32 vcc_lo, v62, 4 +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_lshl_b32 s9, vcc_lo, 24 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_or_b32 s5, s5, s8 +; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0) +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: s_and_b32 s5, s96, 0xff +; SI-NEXT: s_lshl_b32 s8, s78, 8 +; SI-NEXT: s_or_b32 s5, s5, s8 +; SI-NEXT: s_and_b32 s8, s7, 0xff ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_lshl_b32 s8, s8, 16 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v11 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v5 -; SI-NEXT: v_and_b32_e32 v5, 0xff, v44 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v16 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v1, v1, v5 -; SI-NEXT: v_add_i32_e32 v5, vcc, 4, v0 -; SI-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: s_lshl_b32 s5, s16, 8 -; SI-NEXT: s_lshl_b32 s6, s8, 24 -; SI-NEXT: v_add_i32_e32 v5, vcc, 8, v0 -; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v26 -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; SI-NEXT: s_lshl_b32 s4, s4, 8 -; SI-NEXT: v_readlane_b32 s7, v62, 1 -; SI-NEXT: v_readlane_b32 s99, v63, 35 -; SI-NEXT: v_readlane_b32 s97, v63, 33 -; SI-NEXT: v_readlane_b32 s87, v63, 31 -; SI-NEXT: v_readlane_b32 s85, v63, 29 -; SI-NEXT: v_readlane_b32 s83, v63, 27 -; SI-NEXT: v_readlane_b32 s81, v63, 25 -; SI-NEXT: v_readlane_b32 s71, v63, 23 -; SI-NEXT: v_readlane_b32 s69, v63, 21 -; SI-NEXT: v_readlane_b32 s67, v63, 19 -; SI-NEXT: v_readlane_b32 s65, v63, 17 -; SI-NEXT: v_readlane_b32 s55, v63, 15 -; SI-NEXT: v_readlane_b32 s53, v63, 13 -; SI-NEXT: v_readlane_b32 s51, v63, 11 -; SI-NEXT: v_readlane_b32 s49, v63, 9 -; SI-NEXT: v_readlane_b32 s39, v63, 7 -; SI-NEXT: v_readlane_b32 s37, v63, 5 -; SI-NEXT: v_readlane_b32 s35, v63, 3 -; SI-NEXT: v_readlane_b32 s31, v63, 1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v48 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: v_or_b32_e32 v1, s8, v1 +; SI-NEXT: v_readlane_b32 s8, v62, 6 +; SI-NEXT: v_readlane_b32 vcc_hi, v62, 5 ; SI-NEXT: v_or_b32_e32 v1, s5, v1 -; SI-NEXT: s_and_b32 s5, s12, 0xff -; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s5, s86, 0xff +; SI-NEXT: v_readlane_b32 s9, v62, 7 +; SI-NEXT: 
s_lshl_b32 s8, s8, 8 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 +; SI-NEXT: s_or_b32 s5, s5, s8 +; SI-NEXT: v_readlane_b32 s8, v62, 8 +; SI-NEXT: v_readlane_b32 s9, v62, 9 +; SI-NEXT: s_and_b32 s8, s8, 0xff +; SI-NEXT: v_readlane_b32 vcc_lo, v62, 10 +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_lshl_b32 s9, vcc_lo, 24 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_or_b32 s5, s5, s8 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_readlane_b32 vcc_hi, v62, 11 +; SI-NEXT: v_mov_b32_e32 v2, s5 +; SI-NEXT: s_and_b32 s5, s84, 0xff +; SI-NEXT: s_lshl_b32 s8, s61, 8 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 8, v0 +; SI-NEXT: s_or_b32 s5, s5, s8 +; SI-NEXT: s_and_b32 s8, s65, 0xff +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v16 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: v_or_b32_e32 v1, s8, v1 +; SI-NEXT: v_readlane_b32 s8, v62, 12 ; SI-NEXT: v_or_b32_e32 v1, s5, v1 -; SI-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen +; SI-NEXT: s_and_b32 s5, s80, 0xff +; SI-NEXT: v_readlane_b32 s9, v62, 13 +; SI-NEXT: s_lshl_b32 s8, s8, 8 +; SI-NEXT: s_or_b32 s5, s5, s8 +; SI-NEXT: v_readlane_b32 s8, v62, 14 +; SI-NEXT: v_readlane_b32 s9, v62, 15 +; SI-NEXT: s_and_b32 s8, s8, 0xff +; SI-NEXT: v_readlane_b32 s60, v62, 16 +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_lshl_b32 s9, s60, 24 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s8, s9, s8 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v14 -; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v7 -; SI-NEXT: v_or_b32_e32 v1, v1, v5 -; SI-NEXT: v_and_b32_e32 v5, 0xff, v52 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v1, v1, v5 -; SI-NEXT: v_add_i32_e32 v5, vcc, 12, v0 -; SI-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 +; SI-NEXT: s_or_b32 s5, s5, s8 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v2, s5 +; SI-NEXT: s_and_b32 s5, s70, 0xff +; SI-NEXT: s_lshl_b32 s8, s72, 8 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: s_lshl_b32 s5, s22, 8 -; SI-NEXT: s_lshl_b32 s6, s14, 24 -; SI-NEXT: v_add_i32_e32 v5, vcc, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v24 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_or_b32_e32 v1, s5, v1 -; SI-NEXT: s_and_b32 s5, s18, 0xff -; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: v_add_i32_e32 v1, vcc, 16, v0 +; SI-NEXT: s_or_b32 s5, s5, s8 +; SI-NEXT: s_and_b32 s8, s69, 0xff +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v15 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: v_or_b32_e32 v1, s8, v1 +; SI-NEXT: v_readlane_b32 s8, v62, 18 ; SI-NEXT: v_or_b32_e32 v1, s5, v1 -; SI-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen +; SI-NEXT: s_and_b32 s5, s66, 0xff +; SI-NEXT: v_readlane_b32 s9, v62, 19 +; SI-NEXT: s_lshl_b32 s8, s8, 8 +; SI-NEXT: v_readlane_b32 s61, v62, 17 +; SI-NEXT: s_or_b32 s5, s5, s8 +; SI-NEXT: v_readlane_b32 s8, v62, 20 +; SI-NEXT: v_readlane_b32 s9, v62, 21 +; SI-NEXT: s_and_b32 s8, s8, 0xff +; SI-NEXT: v_readlane_b32 s60, v62, 22 +; SI-NEXT: s_lshl_b32 s8, s8, 
16 +; SI-NEXT: s_lshl_b32 s9, s60, 24 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s8, s9, s8 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v19 -; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v27 -; SI-NEXT: v_or_b32_e32 v1, v1, v5 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: s_lshl_b32 s5, s28, 8 -; SI-NEXT: s_lshl_b32 s6, s20, 24 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_or_b32_e32 v1, v1, v5 -; SI-NEXT: v_add_i32_e32 v5, vcc, 20, v0 -; SI-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 +; SI-NEXT: s_or_b32 s5, s5, s8 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v2, s5 +; SI-NEXT: s_and_b32 s5, s38, 0xff +; SI-NEXT: s_lshl_b32 s8, s75, 8 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v5, vcc, 24, v0 -; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v56 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_or_b32_e32 v1, s5, v1 -; SI-NEXT: s_and_b32 s5, s24, 0xff -; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: v_add_i32_e32 v1, vcc, 24, v0 +; SI-NEXT: s_or_b32 s5, s5, s8 +; SI-NEXT: s_and_b32 s8, s91, 0xff +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v13 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: v_or_b32_e32 v1, s8, v1 +; SI-NEXT: v_readlane_b32 s8, v62, 24 ; SI-NEXT: v_or_b32_e32 v1, s5, v1 -; SI-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen +; SI-NEXT: s_and_b32 s5, s52, 0xff +; SI-NEXT: v_readlane_b32 s9, v62, 25 +; SI-NEXT: s_lshl_b32 s8, s8, 8 +; SI-NEXT: v_readlane_b32 s61, v62, 23 +; SI-NEXT: s_or_b32 s5, s5, s8 +; SI-NEXT: v_readlane_b32 s8, v62, 26 +; SI-NEXT: v_readlane_b32 s9, v62, 27 +; SI-NEXT: s_and_b32 s8, s8, 0xff +; SI-NEXT: v_readlane_b32 s60, v62, 28 +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_lshl_b32 s9, s60, 24 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s8, s9, s8 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v47 -; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v9 -; SI-NEXT: v_or_b32_e32 v1, v1, v5 -; SI-NEXT: v_and_b32_e32 v5, 0xff, v45 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v1, v1, v5 -; SI-NEXT: v_add_i32_e32 v5, vcc, 28, v0 -; SI-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 +; SI-NEXT: s_or_b32 s5, s5, s8 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v2, s5 +; SI-NEXT: s_and_b32 s5, s98, 0xff +; SI-NEXT: s_lshl_b32 s8, s58, 8 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: s_lshl_b32 s5, s44, 8 -; SI-NEXT: s_lshl_b32 s6, s26, 24 -; SI-NEXT: v_add_i32_e32 v5, vcc, 32, v0 -; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v20 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_or_b32_e32 v1, s5, v1 -; SI-NEXT: s_and_b32 s5, s40, 0xff -; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: v_add_i32_e32 v1, vcc, 32, v0 
+; SI-NEXT: s_or_b32 s5, s5, s8 +; SI-NEXT: s_and_b32 s8, s37, 0xff +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v11 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: v_or_b32_e32 v1, s8, v1 +; SI-NEXT: v_readlane_b32 s8, v62, 30 ; SI-NEXT: v_or_b32_e32 v1, s5, v1 -; SI-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen +; SI-NEXT: s_and_b32 s5, s30, 0xff +; SI-NEXT: v_readlane_b32 s9, v62, 31 +; SI-NEXT: s_lshl_b32 s8, s8, 8 +; SI-NEXT: v_readlane_b32 s61, v62, 29 +; SI-NEXT: s_or_b32 s5, s5, s8 +; SI-NEXT: v_readlane_b32 s8, v62, 32 +; SI-NEXT: v_readlane_b32 s9, v62, 33 +; SI-NEXT: s_and_b32 s8, s8, 0xff +; SI-NEXT: v_readlane_b32 s60, v62, 34 +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_lshl_b32 s9, s60, 24 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s8, s9, s8 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v58 -; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v13 -; SI-NEXT: v_or_b32_e32 v1, v1, v5 -; SI-NEXT: v_and_b32_e32 v5, 0xff, v32 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_or_b32_e32 v1, v1, v5 -; SI-NEXT: v_add_i32_e32 v5, vcc, 36, v0 -; SI-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 +; SI-NEXT: s_or_b32 s5, s5, s8 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v2, s5 +; SI-NEXT: s_and_b32 s5, s82, 0xff +; SI-NEXT: s_lshl_b32 s8, s43, 8 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: s_lshl_b32 s5, s58, 8 -; SI-NEXT: s_lshl_b32 s6, s42, 24 -; SI-NEXT: v_add_i32_e32 v5, vcc, 40, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_or_b32_e32 v1, s5, v1 -; SI-NEXT: s_and_b32 s5, s46, 0xff -; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: v_add_i32_e32 v1, vcc, 40, v0 +; SI-NEXT: s_or_b32 s5, s5, s8 +; SI-NEXT: s_and_b32 s8, s89, 0xff +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v47 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: v_or_b32_e32 v1, s8, v1 +; SI-NEXT: v_readlane_b32 s8, v62, 36 ; SI-NEXT: v_or_b32_e32 v1, s5, v1 -; SI-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: s_and_b32 s5, s50, 0xff +; SI-NEXT: v_readlane_b32 s9, v62, 37 +; SI-NEXT: s_lshl_b32 s8, s8, 8 +; SI-NEXT: s_or_b32 s5, s5, s8 +; SI-NEXT: v_readlane_b32 s8, v62, 38 +; SI-NEXT: v_readlane_b32 s9, v62, 39 +; SI-NEXT: s_and_b32 s8, s8, 0xff +; SI-NEXT: v_readlane_b32 s42, v62, 40 +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_lshl_b32 s9, s42, 24 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s8, s9, s8 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v41 -; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v12 -; SI-NEXT: v_or_b32_e32 v1, v1, v5 -; SI-NEXT: v_and_b32_e32 v5, 0xff, v49 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: s_lshl_b32 s5, s72, 8 -; SI-NEXT: s_lshl_b32 s6, s56, 24 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; SI-NEXT: v_or_b32_e32 v3, v3, v5 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 
offen +; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 +; SI-NEXT: s_or_b32 s5, s5, s8 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v2, s5 +; SI-NEXT: s_and_b32 s5, s94, 0xff +; SI-NEXT: s_lshl_b32 s8, s44, 8 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_or_b32_e32 v1, s5, v1 -; SI-NEXT: s_and_b32 s5, s60, 0xff -; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: v_add_i32_e32 v1, vcc, 48, v0 +; SI-NEXT: s_or_b32 s5, s5, s8 +; SI-NEXT: s_and_b32 s8, s57, 0xff +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v3 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: v_or_b32_e32 v1, s8, v1 +; SI-NEXT: v_readlane_b32 s8, v62, 42 ; SI-NEXT: v_or_b32_e32 v1, s5, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v4 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: s_and_b32 s5, s92, 0xff +; SI-NEXT: v_readlane_b32 s9, v62, 43 +; SI-NEXT: s_lshl_b32 s8, s8, 8 +; SI-NEXT: v_readlane_b32 s43, v62, 41 +; SI-NEXT: s_or_b32 s5, s5, s8 +; SI-NEXT: v_readlane_b32 s8, v62, 44 +; SI-NEXT: v_readlane_b32 s9, v62, 45 +; SI-NEXT: s_and_b32 s8, s8, 0xff +; SI-NEXT: v_readlane_b32 s42, v62, 46 +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_lshl_b32 s9, s42, 24 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s8, s9, s8 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v59 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_and_b32_e32 v3, 0xff, v36 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: s_lshl_b32 s5, s78, 8 -; SI-NEXT: s_lshl_b32 s6, s62, 24 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 +; SI-NEXT: s_or_b32 s5, s5, s8 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v2, s5 +; SI-NEXT: s_and_b32 s5, s64, 0xff +; SI-NEXT: s_lshl_b32 s8, s27, 8 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_or_b32_e32 v1, s5, v1 -; SI-NEXT: s_and_b32 s5, s74, 0xff -; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: v_add_i32_e32 v1, vcc, 56, v0 +; SI-NEXT: s_or_b32 s5, s5, s8 +; SI-NEXT: s_and_b32 s8, s79, 0xff +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v18 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: v_or_b32_e32 v1, s8, v1 +; SI-NEXT: v_readlane_b32 s8, v62, 48 ; SI-NEXT: v_or_b32_e32 v1, s5, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_and_b32 s5, s76, 0xff +; SI-NEXT: v_readlane_b32 s9, v62, 49 +; SI-NEXT: s_lshl_b32 s8, s8, 8 +; SI-NEXT: s_or_b32 s5, s5, s8 +; SI-NEXT: v_readlane_b32 s8, v62, 50 +; SI-NEXT: v_readlane_b32 s9, v62, 51 +; SI-NEXT: s_and_b32 s8, s8, 
0xff +; SI-NEXT: v_readlane_b32 s26, v62, 52 +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_lshl_b32 s9, s26, 24 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s8, s9, s8 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v61 -; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v10 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: s_lshl_b32 s5, s92, 8 -; SI-NEXT: s_lshl_b32 s6, s76, 24 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 +; SI-NEXT: s_or_b32 s5, s5, s8 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v2, s5 +; SI-NEXT: s_and_b32 s5, s90, 0xff +; SI-NEXT: s_lshl_b32 s8, s28, 8 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_or_b32_e32 v1, s5, v1 -; SI-NEXT: s_and_b32 s5, s88, 0xff -; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: v_add_i32_e32 v1, vcc, 64, v0 +; SI-NEXT: s_or_b32 s5, s5, s8 +; SI-NEXT: s_and_b32 s8, s73, 0xff +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v55 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: v_or_b32_e32 v1, s8, v1 +; SI-NEXT: v_readlane_b32 s8, v62, 54 ; SI-NEXT: v_or_b32_e32 v1, s5, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_and_b32 s5, s62, 0xff +; SI-NEXT: v_readlane_b32 s9, v62, 55 +; SI-NEXT: s_lshl_b32 s8, s8, 8 +; SI-NEXT: v_readlane_b32 s27, v62, 53 +; SI-NEXT: s_or_b32 s5, s5, s8 +; SI-NEXT: v_readlane_b32 s8, v62, 56 +; SI-NEXT: v_readlane_b32 s9, v62, 57 +; SI-NEXT: s_and_b32 s8, s8, 0xff +; SI-NEXT: v_readlane_b32 s26, v62, 58 +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_lshl_b32 s9, s26, 24 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s8, s9, s8 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v60 -; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v23 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: s_lshl_b32 s5, s34, 8 -; SI-NEXT: s_lshl_b32 s6, s90, 24 -; SI-NEXT: v_readlane_b32 s34, v63, 2 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 +; SI-NEXT: s_or_b32 s5, s5, s8 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v2, s5 +; SI-NEXT: s_and_b32 s5, s54, 0xff +; SI-NEXT: 
s_lshl_b32 s8, s74, 8 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_or_b32_e32 v1, s5, v1 -; SI-NEXT: s_and_b32 s5, s94, 0xff -; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x48, v0 +; SI-NEXT: s_or_b32 s5, s5, s8 +; SI-NEXT: s_and_b32 s8, s59, 0xff +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v9 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: v_or_b32_e32 v1, s8, v1 +; SI-NEXT: v_readlane_b32 s8, v62, 60 ; SI-NEXT: v_or_b32_e32 v1, s5, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_and_b32 s5, s46, 0xff +; SI-NEXT: v_readlane_b32 s9, v62, 61 +; SI-NEXT: s_lshl_b32 s8, s8, 8 +; SI-NEXT: v_readlane_b32 s27, v62, 59 +; SI-NEXT: s_or_b32 s5, s5, s8 +; SI-NEXT: v_readlane_b32 s8, v62, 62 +; SI-NEXT: v_readlane_b32 s9, v62, 63 +; SI-NEXT: s_and_b32 s8, s8, 0xff +; SI-NEXT: v_readlane_b32 s26, v61, 0 +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_lshl_b32 s9, s26, 24 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s8, s9, s8 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v25 -; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v18 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: s_lshl_b32 s5, s38, 8 -; SI-NEXT: s_lshl_b32 s6, s30, 24 -; SI-NEXT: v_readlane_b32 s38, v63, 6 -; SI-NEXT: v_readlane_b32 s30, v63, 0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 +; SI-NEXT: s_or_b32 s5, s5, s8 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v2, s5 +; SI-NEXT: s_and_b32 s5, s68, 0xff +; SI-NEXT: s_lshl_b32 s8, s22, 8 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_or_b32_e32 v1, s5, v1 -; SI-NEXT: s_and_b32 s5, s36, 0xff -; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x50, v0 +; SI-NEXT: s_or_b32 s5, s5, s8 +; SI-NEXT: s_and_b32 s8, s45, 0xff +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v4 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: v_or_b32_e32 v1, s8, v1 +; SI-NEXT: v_readlane_b32 s8, v61, 2 ; SI-NEXT: v_or_b32_e32 v1, s5, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: s_and_b32 s5, s40, 0xff +; SI-NEXT: v_readlane_b32 s9, v61, 3 +; SI-NEXT: s_lshl_b32 s8, s8, 8 +; SI-NEXT: v_readlane_b32 s27, v61, 1 +; SI-NEXT: s_or_b32 s5, s5, s8 
+; SI-NEXT: v_readlane_b32 s8, v61, 4 +; SI-NEXT: v_readlane_b32 s9, v61, 5 +; SI-NEXT: s_and_b32 s8, s8, 0xff +; SI-NEXT: v_readlane_b32 s26, v61, 6 +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_lshl_b32 s9, s26, 24 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s8, s9, s8 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v22 -; SI-NEXT: s_lshl_b32 s5, s52, 8 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: s_lshl_b32 s6, s48, 24 -; SI-NEXT: v_readlane_b32 s52, v63, 12 -; SI-NEXT: v_readlane_b32 s48, v63, 8 -; SI-NEXT: v_readlane_b32 s36, v63, 4 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 +; SI-NEXT: s_or_b32 s5, s5, s8 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v2, s5 +; SI-NEXT: s_and_b32 s5, s56, 0xff +; SI-NEXT: s_lshl_b32 s8, s21, 8 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_or_b32_e32 v1, s5, v1 -; SI-NEXT: s_and_b32 s5, s50, 0xff -; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x58, v0 +; SI-NEXT: s_or_b32 s5, s5, s8 +; SI-NEXT: s_and_b32 s8, s29, 0xff +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v14 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: v_or_b32_e32 v1, s8, v1 +; SI-NEXT: v_readlane_b32 s8, v61, 8 ; SI-NEXT: v_or_b32_e32 v1, s5, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: s_and_b32 s5, s24, 0xff +; SI-NEXT: v_readlane_b32 s9, v61, 9 +; SI-NEXT: s_lshl_b32 s8, s8, 8 +; SI-NEXT: s_or_b32 s5, s5, s8 +; SI-NEXT: v_readlane_b32 s8, v61, 10 +; SI-NEXT: v_readlane_b32 s9, v61, 11 +; SI-NEXT: s_and_b32 s8, s8, 0xff +; SI-NEXT: v_readlane_b32 s20, v61, 12 +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_lshl_b32 s9, s20, 24 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s8, s9, s8 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v17 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: s_lshl_b32 s5, s68, 8 -; SI-NEXT: s_lshl_b32 s6, s54, 24 -; SI-NEXT: v_readlane_b32 s68, v63, 20 -; SI-NEXT: v_readlane_b32 s54, v63, 14 -; SI-NEXT: v_readlane_b32 s50, v63, 10 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 -; SI-NEXT: 
v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 +; SI-NEXT: s_or_b32 s5, s5, s8 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v2, s5 +; SI-NEXT: s_and_b32 s5, s34, 0xff +; SI-NEXT: s_lshl_b32 s8, s18, 8 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_or_b32_e32 v1, s5, v1 -; SI-NEXT: s_and_b32 s5, s64, 0xff -; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x60, v0 +; SI-NEXT: s_or_b32 s5, s5, s8 +; SI-NEXT: s_and_b32 s8, s23, 0xff +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v56 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: v_or_b32_e32 v1, s8, v1 +; SI-NEXT: v_readlane_b32 s8, v61, 14 ; SI-NEXT: v_or_b32_e32 v1, s5, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_and_b32 s5, s16, 0xff +; SI-NEXT: v_readlane_b32 s9, v61, 15 +; SI-NEXT: s_lshl_b32 s8, s8, 8 +; SI-NEXT: s_or_b32 s5, s5, s8 +; SI-NEXT: v_readlane_b32 s8, v61, 16 +; SI-NEXT: v_readlane_b32 s9, v61, 17 +; SI-NEXT: s_and_b32 s8, s8, 0xff +; SI-NEXT: v_readlane_b32 s16, v61, 18 +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_lshl_b32 s9, s16, 24 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s8, s9, s8 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v30 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: s_lshl_b32 s5, s82, 8 -; SI-NEXT: s_lshl_b32 s6, s66, 24 -; SI-NEXT: v_readlane_b32 s82, v63, 26 -; SI-NEXT: v_readlane_b32 s66, v63, 18 -; SI-NEXT: v_readlane_b32 s64, v63, 16 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 +; SI-NEXT: s_or_b32 s5, s5, s8 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_or_b32_e32 v1, s5, v1 -; SI-NEXT: s_and_b32 s5, s70, 0xff -; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: s_or_b32 s5, s6, s5 -; SI-NEXT: v_or_b32_e32 v1, s5, v1 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x68, v0 +; SI-NEXT: v_mov_b32_e32 v2, s5 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: s_and_b32 s5, s36, 0xff +; SI-NEXT: s_lshl_b32 s8, s15, 8 +; SI-NEXT: s_or_b32 s5, s5, s8 +; SI-NEXT: s_and_b32 s8, s19, 0xff +; 
SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_and_b32 s5, s5, 0xffff ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v57 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: s_lshl_b32 s5, s96, 8 -; SI-NEXT: s_lshl_b32 s6, s80, 24 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 +; SI-NEXT: s_and_b32 s4, s4, 0xff +; SI-NEXT: v_readlane_b32 s61, v62, 35 +; SI-NEXT: v_readlane_b32 s43, v62, 47 +; SI-NEXT: v_readlane_b32 s27, v61, 7 +; SI-NEXT: v_readlane_b32 s21, v61, 13 +; SI-NEXT: v_readlane_b32 s17, v61, 19 +; SI-NEXT: v_readlane_b32 s99, v63, 35 +; SI-NEXT: v_readlane_b32 s98, v63, 34 +; SI-NEXT: v_readlane_b32 s97, v63, 33 ; SI-NEXT: v_readlane_b32 s96, v63, 32 +; SI-NEXT: v_readlane_b32 s87, v63, 31 +; SI-NEXT: v_readlane_b32 s86, v63, 30 +; SI-NEXT: v_readlane_b32 s85, v63, 29 +; SI-NEXT: v_readlane_b32 s84, v63, 28 +; SI-NEXT: v_readlane_b32 s83, v63, 27 +; SI-NEXT: v_readlane_b32 s82, v63, 26 +; SI-NEXT: v_readlane_b32 s81, v63, 25 ; SI-NEXT: v_readlane_b32 s80, v63, 24 +; SI-NEXT: v_readlane_b32 s71, v63, 23 ; SI-NEXT: v_readlane_b32 s70, v63, 22 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_readlane_b32 s69, v63, 21 +; SI-NEXT: v_readlane_b32 s68, v63, 20 +; SI-NEXT: v_readlane_b32 s67, v63, 19 +; SI-NEXT: v_readlane_b32 s66, v63, 18 +; SI-NEXT: v_readlane_b32 s65, v63, 17 +; SI-NEXT: v_readlane_b32 s64, v63, 16 +; SI-NEXT: v_readlane_b32 s55, v63, 15 +; SI-NEXT: v_readlane_b32 s54, v63, 14 +; SI-NEXT: v_readlane_b32 s53, v63, 13 +; SI-NEXT: v_readlane_b32 s52, v63, 12 +; SI-NEXT: v_readlane_b32 s51, v63, 11 +; SI-NEXT: v_readlane_b32 s50, v63, 10 +; SI-NEXT: v_readlane_b32 s49, v63, 9 +; SI-NEXT: v_readlane_b32 s48, v63, 8 +; SI-NEXT: v_readlane_b32 s39, v63, 7 +; SI-NEXT: v_readlane_b32 s38, v63, 6 +; SI-NEXT: v_readlane_b32 s37, v63, 5 +; SI-NEXT: v_readlane_b32 s36, v63, 4 +; SI-NEXT: v_readlane_b32 s35, v63, 3 +; SI-NEXT: v_readlane_b32 s34, v63, 2 +; SI-NEXT: v_readlane_b32 s31, v63, 1 +; SI-NEXT: v_readlane_b32 s30, v63, 0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; SI-NEXT: v_or_b32_e32 v1, s8, v1 +; SI-NEXT: v_readlane_b32 s8, v61, 20 +; SI-NEXT: v_or_b32_e32 v1, s5, v1 +; SI-NEXT: s_and_b32 s5, s10, 0xff +; SI-NEXT: v_readlane_b32 s9, v61, 21 +; SI-NEXT: s_lshl_b32 s8, s8, 8 +; SI-NEXT: s_or_b32 s5, s5, s8 +; SI-NEXT: v_readlane_b32 s8, v61, 22 +; SI-NEXT: v_readlane_b32 s9, v61, 23 +; SI-NEXT: s_and_b32 s8, s8, 0xff +; SI-NEXT: v_readlane_b32 s10, v61, 24 +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_lshl_b32 s9, s10, 24 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_or_b32 s5, s5, s8 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x70, v0 +; SI-NEXT: v_mov_b32_e32 v2, s5 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:136 ; 
4-byte Folded Reload +; SI-NEXT: s_and_b32 s5, s14, 0xff +; SI-NEXT: s_lshl_b32 s8, s12, 8 +; SI-NEXT: s_or_b32 s5, s5, s8 +; SI-NEXT: v_readlane_b32 s8, v61, 26 +; SI-NEXT: v_readlane_b32 s9, v61, 27 +; SI-NEXT: s_and_b32 s8, s9, 0xff +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 +; SI-NEXT: v_readlane_b32 s11, v61, 25 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; SI-NEXT: v_or_b32_e32 v1, s8, v1 +; SI-NEXT: v_readlane_b32 s8, v61, 28 +; SI-NEXT: v_readlane_b32 s9, v61, 29 ; SI-NEXT: v_or_b32_e32 v1, s5, v1 -; SI-NEXT: s_and_b32 s5, s84, 0xff +; SI-NEXT: s_lshl_b32 s5, s8, 8 +; SI-NEXT: v_readlane_b32 s8, v61, 30 +; SI-NEXT: v_readlane_b32 s9, v61, 31 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s8, 0xff +; SI-NEXT: v_readlane_b32 s8, v61, 32 ; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: s_or_b32 s5, s6, s5 -; SI-NEXT: v_or_b32_e32 v1, s5, v1 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v46 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: s_lshl_b32 s5, s86, 24 -; SI-NEXT: v_readlane_b32 s86, v63, 30 -; SI-NEXT: v_readlane_b32 s84, v63, 28 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 +; SI-NEXT: s_lshl_b32 s8, s8, 24 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s8, s5 +; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x78, v0 +; SI-NEXT: v_mov_b32_e32 v2, s4 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 -; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v35 +; SI-NEXT: s_and_b32 s4, s88, 0xff +; SI-NEXT: s_lshl_b32 s5, s6, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s13, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 +; SI-NEXT: v_readlane_b32 s9, v61, 33 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_or_b32_e32 v1, s4, v1 -; SI-NEXT: s_and_b32 s4, s98, 0xff -; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; SI-NEXT: v_or_b32_e32 v1, s5, v1 ; SI-NEXT: v_or_b32_e32 v1, s4, v1 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v38 -; SI-NEXT: v_readlane_b32 s98, v63, 34 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; 
SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; SI-NEXT: s_or_saveexec_b64 s[4:5], -1 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[4:5] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -168591,8 +169636,9 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_or_saveexec_b64 s[4:5], -1 -; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; 
VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; VI-NEXT: s_mov_b64 exec, s[4:5] ; VI-NEXT: v_writelane_b32 v63, s30, 0 ; VI-NEXT: v_writelane_b32 v63, s31, 1 @@ -168627,209 +169673,225 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_writelane_b32 v63, s86, 30 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 ; VI-NEXT: v_writelane_b32 v63, s87, 31 -; VI-NEXT: v_readfirstlane_b32 s44, v3 -; VI-NEXT: v_readfirstlane_b32 s45, v4 -; VI-NEXT: v_readfirstlane_b32 s42, v5 -; VI-NEXT: v_readfirstlane_b32 s43, v6 -; VI-NEXT: v_readfirstlane_b32 s40, v7 -; VI-NEXT: v_readfirstlane_b32 s41, v8 -; VI-NEXT: v_readfirstlane_b32 s14, v9 -; VI-NEXT: v_readfirstlane_b32 s15, v10 -; VI-NEXT: v_readfirstlane_b32 s12, v11 -; VI-NEXT: v_readfirstlane_b32 s13, v12 -; VI-NEXT: v_readfirstlane_b32 s10, v13 -; VI-NEXT: v_readfirstlane_b32 s11, v14 -; VI-NEXT: v_readfirstlane_b32 s8, v15 -; VI-NEXT: v_readfirstlane_b32 s9, v16 -; VI-NEXT: v_readfirstlane_b32 s6, v17 -; VI-NEXT: v_readfirstlane_b32 s7, v18 +; VI-NEXT: v_readfirstlane_b32 s48, v3 +; VI-NEXT: v_readfirstlane_b32 s49, v4 +; VI-NEXT: v_readfirstlane_b32 s38, v5 +; VI-NEXT: v_readfirstlane_b32 s39, v6 +; VI-NEXT: v_readfirstlane_b32 s36, v7 +; VI-NEXT: v_readfirstlane_b32 s37, v8 +; VI-NEXT: v_readfirstlane_b32 s34, v9 +; VI-NEXT: v_readfirstlane_b32 s35, v10 +; VI-NEXT: v_readfirstlane_b32 s30, v11 +; VI-NEXT: v_readfirstlane_b32 s31, v12 +; VI-NEXT: v_readfirstlane_b32 s90, v13 +; VI-NEXT: v_readfirstlane_b32 s91, v14 +; VI-NEXT: v_readfirstlane_b32 s88, v15 +; VI-NEXT: v_readfirstlane_b32 s89, v16 +; VI-NEXT: v_readfirstlane_b32 s76, v17 +; VI-NEXT: v_readfirstlane_b32 s77, v18 ; VI-NEXT: v_readfirstlane_b32 s4, v1 -; VI-NEXT: s_and_b64 s[46:47], vcc, exec +; VI-NEXT: s_and_b64 s[6:7], vcc, exec ; VI-NEXT: v_readfirstlane_b32 s5, v2 -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:40 ; 
4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr61 : SGPR spill to VGPR lane ; VI-NEXT: ; implicit-def: $vgpr62 : SGPR spill to VGPR lane ; VI-NEXT: s_cbranch_scc0 .LBB91_3 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: s_lshr_b32 s46, s5, 24 -; VI-NEXT: v_writelane_b32 v62, s46, 57 -; VI-NEXT: s_lshr_b32 s46, s5, 16 -; VI-NEXT: v_writelane_b32 v62, s46, 56 -; VI-NEXT: s_lshr_b32 s46, s5, 8 -; VI-NEXT: v_writelane_b32 v62, s46, 55 -; VI-NEXT: s_lshr_b32 s46, s4, 16 -; VI-NEXT: v_writelane_b32 v62, s46, 54 -; VI-NEXT: s_lshr_b32 s46, s4, 8 -; VI-NEXT: v_writelane_b32 v62, s46, 53 -; VI-NEXT: s_lshr_b32 s46, s29, 24 -; VI-NEXT: v_writelane_b32 v62, s46, 52 -; VI-NEXT: s_lshr_b32 s46, s29, 16 -; VI-NEXT: v_writelane_b32 v62, s46, 51 -; VI-NEXT: s_lshr_b32 s46, s29, 8 -; VI-NEXT: v_writelane_b32 v62, s46, 50 -; VI-NEXT: s_lshr_b32 s46, s28, 16 -; VI-NEXT: v_writelane_b32 v62, s46, 49 -; VI-NEXT: s_lshr_b32 s46, s28, 8 -; VI-NEXT: v_writelane_b32 v62, s46, 48 -; VI-NEXT: s_lshr_b32 s46, s27, 24 -; VI-NEXT: v_writelane_b32 v62, s46, 47 -; VI-NEXT: s_lshr_b32 s46, s27, 16 -; VI-NEXT: v_writelane_b32 v62, s46, 46 -; VI-NEXT: s_lshr_b32 s46, s27, 8 -; VI-NEXT: v_writelane_b32 v62, s46, 45 -; VI-NEXT: s_lshr_b32 s46, s26, 16 -; VI-NEXT: v_writelane_b32 v62, s46, 44 -; VI-NEXT: s_lshr_b32 s46, s26, 8 -; VI-NEXT: v_writelane_b32 v62, s46, 43 -; VI-NEXT: s_lshr_b32 s46, s25, 24 -; VI-NEXT: v_writelane_b32 v62, s46, 42 -; VI-NEXT: s_lshr_b32 s46, s25, 16 -; VI-NEXT: v_writelane_b32 v62, s46, 41 -; VI-NEXT: s_lshr_b32 s46, s25, 8 -; VI-NEXT: v_writelane_b32 v62, s46, 40 -; VI-NEXT: s_lshr_b32 s46, s24, 16 -; VI-NEXT: v_writelane_b32 v62, s46, 39 -; VI-NEXT: s_lshr_b32 s46, s24, 8 -; VI-NEXT: v_writelane_b32 v62, s46, 38 -; VI-NEXT: s_lshr_b32 s46, s23, 24 -; VI-NEXT: v_writelane_b32 v62, s46, 37 -; VI-NEXT: s_lshr_b32 s46, s23, 16 -; VI-NEXT: v_writelane_b32 v62, s46, 36 -; VI-NEXT: s_lshr_b32 s46, s23, 8 -; VI-NEXT: v_writelane_b32 v62, s46, 35 -; VI-NEXT: s_lshr_b32 s46, s22, 16 -; VI-NEXT: v_writelane_b32 v62, s46, 34 -; VI-NEXT: s_lshr_b32 s46, s22, 8 -; VI-NEXT: v_writelane_b32 v62, s46, 33 -; VI-NEXT: s_lshr_b32 s46, s21, 24 -; VI-NEXT: v_writelane_b32 v62, s46, 32 -; VI-NEXT: s_lshr_b32 s46, s21, 16 -; VI-NEXT: v_writelane_b32 v62, s46, 31 -; VI-NEXT: s_lshr_b32 s46, s21, 8 -; VI-NEXT: v_writelane_b32 v62, s46, 30 -; VI-NEXT: s_lshr_b32 s46, s20, 16 -; VI-NEXT: v_writelane_b32 v62, s46, 29 -; VI-NEXT: s_lshr_b32 s46, s20, 8 -; VI-NEXT: v_writelane_b32 v62, s46, 28 -; VI-NEXT: s_lshr_b32 s46, s19, 24 -; VI-NEXT: v_writelane_b32 v62, s46, 27 -; VI-NEXT: s_lshr_b32 s46, s19, 16 -; VI-NEXT: v_writelane_b32 v62, s46, 26 -; VI-NEXT: s_lshr_b32 s46, s19, 8 -; VI-NEXT: 
v_writelane_b32 v62, s46, 25 -; VI-NEXT: s_lshr_b32 s46, s18, 16 -; VI-NEXT: v_writelane_b32 v62, s46, 24 -; VI-NEXT: s_lshr_b32 s46, s18, 8 -; VI-NEXT: v_writelane_b32 v62, s46, 23 -; VI-NEXT: s_lshr_b32 s46, s17, 24 -; VI-NEXT: v_writelane_b32 v62, s46, 22 -; VI-NEXT: s_lshr_b32 s46, s17, 16 -; VI-NEXT: v_writelane_b32 v62, s46, 21 -; VI-NEXT: s_lshr_b32 s46, s17, 8 -; VI-NEXT: v_writelane_b32 v62, s46, 20 -; VI-NEXT: s_lshr_b32 s46, s16, 16 -; VI-NEXT: v_writelane_b32 v62, s46, 19 -; VI-NEXT: s_lshr_b32 s46, s16, 8 -; VI-NEXT: v_writelane_b32 v62, s46, 18 -; VI-NEXT: s_lshr_b32 s46, s7, 24 -; VI-NEXT: v_writelane_b32 v62, s46, 17 -; VI-NEXT: s_lshr_b32 s46, s7, 16 -; VI-NEXT: v_writelane_b32 v62, s46, 16 -; VI-NEXT: s_lshr_b32 s46, s7, 8 -; VI-NEXT: v_writelane_b32 v62, s46, 15 -; VI-NEXT: s_lshr_b32 s46, s6, 16 -; VI-NEXT: v_writelane_b32 v62, s46, 14 -; VI-NEXT: s_lshr_b32 s46, s6, 8 -; VI-NEXT: v_writelane_b32 v62, s46, 13 -; VI-NEXT: s_lshr_b32 s46, s9, 24 -; VI-NEXT: v_writelane_b32 v62, s46, 12 -; VI-NEXT: s_lshr_b32 s46, s9, 16 -; VI-NEXT: v_writelane_b32 v62, s46, 11 -; VI-NEXT: s_lshr_b32 s46, s9, 8 -; VI-NEXT: v_writelane_b32 v62, s46, 10 -; VI-NEXT: s_lshr_b32 s46, s8, 16 -; VI-NEXT: v_writelane_b32 v62, s46, 9 -; VI-NEXT: s_lshr_b32 s46, s8, 8 -; VI-NEXT: v_writelane_b32 v62, s46, 8 -; VI-NEXT: s_lshr_b32 s46, s11, 24 -; VI-NEXT: v_writelane_b32 v62, s46, 7 -; VI-NEXT: s_lshr_b32 s46, s11, 16 -; VI-NEXT: v_writelane_b32 v62, s46, 6 -; VI-NEXT: s_lshr_b32 s46, s11, 8 -; VI-NEXT: v_writelane_b32 v62, s46, 5 -; VI-NEXT: s_lshr_b32 s46, s10, 16 -; VI-NEXT: v_writelane_b32 v62, s46, 4 -; VI-NEXT: s_lshr_b32 s46, s10, 8 -; VI-NEXT: v_writelane_b32 v62, s46, 3 -; VI-NEXT: s_lshr_b32 s46, s13, 24 -; VI-NEXT: v_writelane_b32 v62, s46, 2 -; VI-NEXT: s_lshr_b32 s46, s13, 16 -; VI-NEXT: v_writelane_b32 v62, s46, 1 -; VI-NEXT: s_lshr_b32 s46, s12, 16 -; VI-NEXT: s_lshr_b32 s80, s13, 8 -; VI-NEXT: v_writelane_b32 v62, s46, 0 -; VI-NEXT: s_lshr_b32 s81, s12, 8 -; VI-NEXT: s_lshr_b32 s82, s15, 24 -; VI-NEXT: s_lshr_b32 s83, s15, 16 -; VI-NEXT: s_lshr_b32 s85, s15, 8 -; VI-NEXT: s_lshr_b32 s84, s14, 16 -; VI-NEXT: s_lshr_b32 s86, s14, 8 -; VI-NEXT: s_lshr_b32 s87, s41, 24 -; VI-NEXT: s_lshr_b32 s50, s41, 16 -; VI-NEXT: s_lshr_b32 s52, s41, 8 -; VI-NEXT: s_lshr_b32 s51, s40, 16 -; VI-NEXT: s_lshr_b32 s53, s40, 8 -; VI-NEXT: s_lshr_b32 s54, s43, 24 -; VI-NEXT: s_lshr_b32 s55, s43, 16 -; VI-NEXT: s_lshr_b32 s65, s43, 8 -; VI-NEXT: s_lshr_b32 s64, s42, 16 -; VI-NEXT: s_lshr_b32 s66, s42, 8 -; VI-NEXT: s_lshr_b32 s67, s45, 24 -; VI-NEXT: s_lshr_b32 s68, s45, 16 -; VI-NEXT: s_lshr_b32 s70, s45, 8 -; VI-NEXT: s_lshr_b32 s69, s44, 16 -; VI-NEXT: s_lshr_b32 s71, s44, 8 -; VI-NEXT: s_lshr_b64 s[46:47], s[4:5], 24 -; VI-NEXT: s_lshr_b64 s[56:57], s[28:29], 24 -; VI-NEXT: s_lshr_b64 s[58:59], s[26:27], 24 -; VI-NEXT: s_lshr_b64 s[60:61], s[24:25], 24 -; VI-NEXT: s_lshr_b64 s[62:63], s[22:23], 24 -; VI-NEXT: s_lshr_b64 s[72:73], s[20:21], 24 -; VI-NEXT: s_lshr_b64 s[74:75], s[18:19], 24 -; VI-NEXT: s_lshr_b64 s[76:77], s[16:17], 24 -; VI-NEXT: s_lshr_b64 s[78:79], s[6:7], 24 -; VI-NEXT: s_lshr_b64 s[88:89], s[8:9], 24 -; VI-NEXT: s_lshr_b64 s[90:91], s[10:11], 24 -; VI-NEXT: s_lshr_b64 s[30:31], s[12:13], 24 -; VI-NEXT: s_lshr_b64 s[34:35], s[14:15], 24 -; VI-NEXT: s_lshr_b64 s[36:37], s[40:41], 24 -; VI-NEXT: s_lshr_b64 s[38:39], s[42:43], 24 -; VI-NEXT: s_lshr_b64 s[48:49], s[44:45], 24 +; VI-NEXT: s_lshr_b32 s6, s5, 24 +; VI-NEXT: v_writelane_b32 v61, s6, 26 +; VI-NEXT: s_lshr_b32 s6, s5, 16 +; 
VI-NEXT: v_writelane_b32 v61, s6, 27 +; VI-NEXT: s_lshr_b32 s6, s5, 8 +; VI-NEXT: v_writelane_b32 v61, s6, 28 +; VI-NEXT: s_lshr_b32 s6, s4, 16 +; VI-NEXT: v_writelane_b32 v61, s6, 29 +; VI-NEXT: s_lshr_b32 s6, s4, 8 +; VI-NEXT: v_writelane_b32 v61, s6, 30 +; VI-NEXT: s_lshr_b32 s6, s29, 24 +; VI-NEXT: v_writelane_b32 v61, s6, 31 +; VI-NEXT: s_lshr_b32 s6, s29, 16 +; VI-NEXT: v_writelane_b32 v61, s6, 32 +; VI-NEXT: s_lshr_b32 s6, s29, 8 +; VI-NEXT: v_writelane_b32 v61, s6, 33 +; VI-NEXT: s_lshr_b32 s6, s28, 16 +; VI-NEXT: v_writelane_b32 v61, s6, 34 +; VI-NEXT: s_lshr_b32 s6, s28, 8 +; VI-NEXT: v_writelane_b32 v61, s6, 35 +; VI-NEXT: s_lshr_b32 s6, s27, 24 +; VI-NEXT: v_writelane_b32 v61, s6, 36 +; VI-NEXT: s_lshr_b32 s6, s27, 16 +; VI-NEXT: v_writelane_b32 v61, s6, 37 +; VI-NEXT: s_lshr_b32 s6, s27, 8 +; VI-NEXT: v_writelane_b32 v61, s6, 38 +; VI-NEXT: s_lshr_b32 s6, s26, 16 +; VI-NEXT: v_writelane_b32 v61, s6, 39 +; VI-NEXT: s_lshr_b32 s6, s26, 8 +; VI-NEXT: v_writelane_b32 v61, s6, 40 +; VI-NEXT: s_lshr_b32 s6, s25, 24 +; VI-NEXT: v_writelane_b32 v61, s6, 41 +; VI-NEXT: s_lshr_b32 s6, s25, 16 +; VI-NEXT: v_writelane_b32 v61, s6, 42 +; VI-NEXT: s_lshr_b32 s6, s25, 8 +; VI-NEXT: v_writelane_b32 v61, s6, 43 +; VI-NEXT: s_lshr_b32 s6, s24, 16 +; VI-NEXT: v_writelane_b32 v61, s6, 44 +; VI-NEXT: s_lshr_b32 s6, s24, 8 +; VI-NEXT: v_writelane_b32 v61, s6, 45 +; VI-NEXT: s_lshr_b32 s6, s23, 24 +; VI-NEXT: v_writelane_b32 v61, s6, 46 +; VI-NEXT: s_lshr_b32 s6, s23, 16 +; VI-NEXT: v_writelane_b32 v61, s6, 47 +; VI-NEXT: s_lshr_b32 s6, s23, 8 +; VI-NEXT: v_writelane_b32 v61, s6, 48 +; VI-NEXT: s_lshr_b32 s6, s22, 16 +; VI-NEXT: v_writelane_b32 v61, s6, 49 +; VI-NEXT: s_lshr_b32 s6, s22, 8 +; VI-NEXT: v_writelane_b32 v61, s6, 50 +; VI-NEXT: s_lshr_b32 s6, s21, 24 +; VI-NEXT: v_writelane_b32 v61, s6, 51 +; VI-NEXT: s_lshr_b32 s6, s21, 16 +; VI-NEXT: v_writelane_b32 v61, s6, 52 +; VI-NEXT: s_lshr_b32 s6, s21, 8 +; VI-NEXT: v_writelane_b32 v61, s6, 53 +; VI-NEXT: s_lshr_b32 s6, s20, 16 +; VI-NEXT: v_writelane_b32 v61, s6, 54 +; VI-NEXT: s_lshr_b32 s6, s20, 8 +; VI-NEXT: v_writelane_b32 v61, s6, 55 +; VI-NEXT: s_lshr_b32 s6, s19, 24 +; VI-NEXT: v_writelane_b32 v61, s6, 56 +; VI-NEXT: s_lshr_b32 s6, s19, 16 +; VI-NEXT: v_writelane_b32 v61, s6, 57 +; VI-NEXT: s_lshr_b32 s6, s19, 8 +; VI-NEXT: v_writelane_b32 v61, s6, 58 +; VI-NEXT: s_lshr_b32 s6, s18, 16 +; VI-NEXT: v_writelane_b32 v61, s6, 59 +; VI-NEXT: s_lshr_b32 s6, s18, 8 +; VI-NEXT: v_writelane_b32 v61, s6, 60 +; VI-NEXT: s_lshr_b32 s6, s17, 24 +; VI-NEXT: v_writelane_b32 v61, s6, 61 +; VI-NEXT: s_lshr_b32 s6, s17, 16 +; VI-NEXT: v_writelane_b32 v61, s6, 62 +; VI-NEXT: s_lshr_b32 s6, s17, 8 +; VI-NEXT: v_writelane_b32 v61, s6, 63 +; VI-NEXT: s_lshr_b32 s6, s16, 16 +; VI-NEXT: v_writelane_b32 v62, s6, 0 +; VI-NEXT: s_lshr_b32 s6, s16, 8 +; VI-NEXT: v_writelane_b32 v62, s6, 1 +; VI-NEXT: s_lshr_b32 s6, s39, 24 +; VI-NEXT: v_writelane_b32 v61, s6, 18 +; VI-NEXT: s_lshr_b32 s6, s39, 16 +; VI-NEXT: v_writelane_b32 v61, s6, 19 +; VI-NEXT: s_lshr_b32 s6, s39, 8 +; VI-NEXT: v_writelane_b32 v61, s6, 20 +; VI-NEXT: s_lshr_b32 s6, s38, 16 +; VI-NEXT: v_writelane_b32 v61, s6, 16 +; VI-NEXT: s_lshr_b32 s6, s38, 8 +; VI-NEXT: v_writelane_b32 v61, s6, 17 +; VI-NEXT: s_lshr_b32 s6, s49, 24 +; VI-NEXT: v_writelane_b32 v61, s6, 23 +; VI-NEXT: s_lshr_b32 s6, s49, 16 +; VI-NEXT: v_writelane_b32 v61, s6, 24 +; VI-NEXT: s_lshr_b32 s6, s49, 8 +; VI-NEXT: v_writelane_b32 v61, s6, 25 +; VI-NEXT: s_lshr_b32 s6, s48, 16 +; VI-NEXT: v_writelane_b32 v61, s6, 21 +; VI-NEXT: 
s_lshr_b32 s6, s48, 8 +; VI-NEXT: v_writelane_b32 v61, s6, 22 +; VI-NEXT: s_lshr_b64 vcc, s[4:5], 24 +; VI-NEXT: v_writelane_b32 v61, vcc_lo, 14 +; VI-NEXT: v_writelane_b32 v61, vcc_hi, 15 +; VI-NEXT: s_lshr_b64 vcc, s[28:29], 24 +; VI-NEXT: v_writelane_b32 v61, vcc_lo, 12 +; VI-NEXT: v_writelane_b32 v61, vcc_hi, 13 +; VI-NEXT: s_lshr_b64 vcc, s[26:27], 24 +; VI-NEXT: v_writelane_b32 v61, vcc_lo, 10 +; VI-NEXT: v_writelane_b32 v61, vcc_hi, 11 +; VI-NEXT: s_lshr_b64 vcc, s[24:25], 24 +; VI-NEXT: v_writelane_b32 v61, vcc_lo, 8 +; VI-NEXT: v_writelane_b32 v61, vcc_hi, 9 +; VI-NEXT: s_lshr_b64 vcc, s[22:23], 24 +; VI-NEXT: v_writelane_b32 v61, vcc_lo, 6 +; VI-NEXT: v_writelane_b32 v61, vcc_hi, 7 +; VI-NEXT: s_lshr_b64 vcc, s[20:21], 24 +; VI-NEXT: v_writelane_b32 v61, vcc_lo, 4 +; VI-NEXT: v_writelane_b32 v61, vcc_hi, 5 +; VI-NEXT: s_lshr_b64 vcc, s[18:19], 24 +; VI-NEXT: v_writelane_b32 v61, vcc_lo, 2 +; VI-NEXT: v_writelane_b32 v61, vcc_hi, 3 +; VI-NEXT: s_lshr_b64 vcc, s[16:17], 24 +; VI-NEXT: v_writelane_b32 v61, vcc_lo, 0 +; VI-NEXT: s_lshr_b32 s87, s77, 24 +; VI-NEXT: s_lshr_b32 s43, s77, 16 +; VI-NEXT: s_lshr_b32 s42, s77, 8 +; VI-NEXT: s_lshr_b32 s13, s76, 16 +; VI-NEXT: s_lshr_b32 s11, s76, 8 +; VI-NEXT: s_lshr_b32 s86, s89, 24 +; VI-NEXT: s_lshr_b32 s85, s89, 16 +; VI-NEXT: s_lshr_b32 s84, s89, 8 +; VI-NEXT: s_lshr_b32 s9, s88, 16 +; VI-NEXT: s_lshr_b32 s7, s88, 8 +; VI-NEXT: s_lshr_b32 s75, s91, 24 +; VI-NEXT: s_lshr_b32 s74, s91, 16 +; VI-NEXT: s_lshr_b32 s73, s91, 8 +; VI-NEXT: s_lshr_b32 s79, s90, 16 +; VI-NEXT: s_lshr_b32 s78, s90, 8 +; VI-NEXT: s_lshr_b32 s60, s31, 24 +; VI-NEXT: s_lshr_b32 s15, s31, 16 +; VI-NEXT: s_lshr_b32 s14, s31, 8 +; VI-NEXT: s_lshr_b32 s72, s30, 16 +; VI-NEXT: s_lshr_b32 s61, s30, 8 +; VI-NEXT: s_lshr_b32 s63, s35, 24 +; VI-NEXT: s_lshr_b32 s57, s35, 16 +; VI-NEXT: s_lshr_b32 s56, s35, 8 +; VI-NEXT: s_lshr_b32 s83, s34, 16 +; VI-NEXT: s_lshr_b32 s82, s34, 8 +; VI-NEXT: s_lshr_b32 s41, s37, 24 +; VI-NEXT: s_lshr_b32 s47, s37, 16 +; VI-NEXT: s_lshr_b32 s46, s37, 8 +; VI-NEXT: s_lshr_b32 s59, s36, 16 +; VI-NEXT: s_lshr_b32 s45, s36, 8 +; VI-NEXT: v_writelane_b32 v61, vcc_hi, 1 +; VI-NEXT: s_lshr_b64 s[50:51], s[76:77], 24 +; VI-NEXT: s_lshr_b64 s[52:53], s[88:89], 24 +; VI-NEXT: s_lshr_b64 s[54:55], s[90:91], 24 +; VI-NEXT: s_lshr_b64 s[64:65], s[30:31], 24 +; VI-NEXT: s_lshr_b64 s[66:67], s[34:35], 24 +; VI-NEXT: s_lshr_b64 s[68:69], s[36:37], 24 +; VI-NEXT: s_lshr_b64 s[70:71], s[38:39], 24 +; VI-NEXT: s_lshr_b64 s[80:81], s[48:49], 24 +; VI-NEXT: s_mov_b32 s6, s17 +; VI-NEXT: s_mov_b32 s8, s19 +; VI-NEXT: s_mov_b32 s10, s21 +; VI-NEXT: s_mov_b32 s12, s23 +; VI-NEXT: s_mov_b32 s40, s25 +; VI-NEXT: s_mov_b32 s44, s27 +; VI-NEXT: s_mov_b32 s58, s29 +; VI-NEXT: s_mov_b32 s62, s5 ; VI-NEXT: s_cbranch_execnz .LBB91_4 ; VI-NEXT: .LBB91_2: ; %cmp.true -; VI-NEXT: s_lshl_b32 s46, s45, 16 -; VI-NEXT: v_mov_b32_e32 v31, 0x40c00000 -; VI-NEXT: v_add_f32_e32 v1, s46, v31 +; VI-NEXT: s_lshl_b32 s6, s49, 16 +; VI-NEXT: v_mov_b32_e32 v25, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v1, s6, v25 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s45, s45, 0xffff0000 +; VI-NEXT: s_and_b32 s6, s49, 0xffff0000 ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s45, v31 +; VI-NEXT: v_add_f32_e32 v2, s6, v25 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 ; 
VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 @@ -168837,53 +169899,33 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s45, s44, 16 -; VI-NEXT: v_alignbit_b32 v2, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s45, v31 -; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_lshrrev_b64 v[1:2], 16, v[1:2] +; VI-NEXT: s_lshl_b32 s6, s48, 16 +; VI-NEXT: v_add_f32_e32 v2, s6, v25 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s44, s44, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc -; VI-NEXT: v_add_f32_e32 v3, s44, v31 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: s_lshl_b32 s44, s43, 16 -; VI-NEXT: v_alignbit_b32 v1, v3, v1, 16 -; VI-NEXT: v_add_f32_e32 v3, s44, v31 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b32 s6, s48, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_add_f32_e32 v3, s6, v25 ; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 ; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 ; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 ; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: s_and_b32 s43, s43, 0xffff0000 +; VI-NEXT: s_lshl_b32 s6, s39, 16 ; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: v_add_f32_e32 v4, s43, v31 +; VI-NEXT: v_add_f32_e32 v4, s6, v25 ; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 ; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 ; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 ; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; VI-NEXT: s_and_b32 s6, s39, 0xffff0000 ; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; VI-NEXT: s_lshl_b32 s43, s42, 16 -; VI-NEXT: v_alignbit_b32 v4, v4, v3, 16 -; VI-NEXT: v_add_f32_e32 v3, s43, v31 -; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: s_and_b32 s42, s42, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc -; VI-NEXT: v_add_f32_e32 v5, s42, v31 +; VI-NEXT: v_add_f32_e32 v5, s6, v25 ; VI-NEXT: v_bfe_u32 v6, v5, 16, 1 ; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v5 ; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 @@ -168891,53 +169933,33 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 ; VI-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc ; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; VI-NEXT: s_lshl_b32 s42, s41, 16 -; VI-NEXT: v_alignbit_b32 v3, v5, v3, 16 -; VI-NEXT: v_add_f32_e32 v5, s42, v31 +; VI-NEXT: v_lshrrev_b64 v[4:5], 16, v[4:5] +; VI-NEXT: s_lshl_b32 s6, s38, 16 +; VI-NEXT: v_add_f32_e32 v5, s6, v25 ; VI-NEXT: v_bfe_u32 v6, v5, 16, 1 ; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v5 ; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 ; VI-NEXT: v_or_b32_e32 v7, 0x400000, v5 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; 
VI-NEXT: s_and_b32 s41, s41, 0xffff0000 +; VI-NEXT: s_and_b32 s6, s38, 0xffff0000 ; VI-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc -; VI-NEXT: v_add_f32_e32 v6, s41, v31 +; VI-NEXT: v_add_f32_e32 v6, s6, v25 ; VI-NEXT: v_bfe_u32 v7, v6, 16, 1 ; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v6 ; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 ; VI-NEXT: v_or_b32_e32 v8, 0x400000, v6 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; VI-NEXT: s_lshl_b32 s6, s37, 16 ; VI-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc -; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; VI-NEXT: s_lshl_b32 s41, s40, 16 -; VI-NEXT: v_alignbit_b32 v6, v6, v5, 16 -; VI-NEXT: v_add_f32_e32 v5, s41, v31 -; VI-NEXT: v_bfe_u32 v7, v5, 16, 1 -; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v5 -; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; VI-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; VI-NEXT: s_and_b32 s40, s40, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc -; VI-NEXT: v_add_f32_e32 v7, s40, v31 -; VI-NEXT: v_bfe_u32 v8, v7, 16, 1 -; VI-NEXT: v_add_u32_e32 v8, vcc, v8, v7 -; VI-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; VI-NEXT: v_or_b32_e32 v9, 0x400000, v7 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 -; VI-NEXT: v_cndmask_b32_e32 v7, v8, v9, vcc -; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; VI-NEXT: s_lshl_b32 s40, s15, 16 -; VI-NEXT: v_alignbit_b32 v5, v7, v5, 16 -; VI-NEXT: v_add_f32_e32 v7, s40, v31 +; VI-NEXT: v_add_f32_e32 v7, s6, v25 ; VI-NEXT: v_bfe_u32 v8, v7, 16, 1 ; VI-NEXT: v_add_u32_e32 v8, vcc, v8, v7 ; VI-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 ; VI-NEXT: v_or_b32_e32 v9, 0x400000, v7 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 -; VI-NEXT: s_and_b32 s15, s15, 0xffff0000 +; VI-NEXT: s_and_b32 s6, s37, 0xffff0000 ; VI-NEXT: v_cndmask_b32_e32 v7, v8, v9, vcc -; VI-NEXT: v_add_f32_e32 v8, s15, v31 +; VI-NEXT: v_add_f32_e32 v8, s6, v25 ; VI-NEXT: v_bfe_u32 v9, v8, 16, 1 ; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8 ; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 @@ -168945,53 +169967,33 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 ; VI-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc ; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; VI-NEXT: s_lshl_b32 s15, s14, 16 -; VI-NEXT: v_alignbit_b32 v8, v8, v7, 16 -; VI-NEXT: v_add_f32_e32 v7, s15, v31 -; VI-NEXT: v_bfe_u32 v9, v7, 16, 1 -; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v7 +; VI-NEXT: v_lshrrev_b64 v[7:8], 16, v[7:8] +; VI-NEXT: s_lshl_b32 s6, s36, 16 +; VI-NEXT: v_add_f32_e32 v8, s6, v25 +; VI-NEXT: v_bfe_u32 v9, v8, 16, 1 +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8 ; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; VI-NEXT: v_or_b32_e32 v10, 0x400000, v7 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 -; VI-NEXT: s_and_b32 s14, s14, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v7, v9, v10, vcc -; VI-NEXT: v_add_f32_e32 v9, s14, v31 -; VI-NEXT: v_bfe_u32 v10, v9, 16, 1 -; VI-NEXT: v_add_u32_e32 v10, vcc, v10, v9 -; VI-NEXT: v_add_u32_e32 v10, vcc, 0x7fff, v10 -; VI-NEXT: v_or_b32_e32 v11, 0x400000, v9 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 -; VI-NEXT: v_cndmask_b32_e32 v9, v10, v11, vcc -; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; VI-NEXT: s_lshl_b32 s14, s13, 16 -; VI-NEXT: v_alignbit_b32 v7, v9, v7, 16 -; VI-NEXT: v_add_f32_e32 v9, s14, v31 +; VI-NEXT: v_or_b32_e32 v10, 0x400000, v8 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; VI-NEXT: s_and_b32 s6, s36, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc +; VI-NEXT: v_add_f32_e32 v9, s6, v25 ; VI-NEXT: v_bfe_u32 v10, v9, 16, 1 ; VI-NEXT: v_add_u32_e32 v10, vcc, v10, v9 ; VI-NEXT: 
v_add_u32_e32 v10, vcc, 0x7fff, v10 ; VI-NEXT: v_or_b32_e32 v11, 0x400000, v9 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 -; VI-NEXT: s_and_b32 s13, s13, 0xffff0000 +; VI-NEXT: s_lshl_b32 s6, s35, 16 ; VI-NEXT: v_cndmask_b32_e32 v9, v10, v11, vcc -; VI-NEXT: v_add_f32_e32 v10, s13, v31 +; VI-NEXT: v_add_f32_e32 v10, s6, v25 ; VI-NEXT: v_bfe_u32 v11, v10, 16, 1 ; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v10 ; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 ; VI-NEXT: v_or_b32_e32 v12, 0x400000, v10 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 +; VI-NEXT: s_and_b32 s6, s35, 0xffff0000 ; VI-NEXT: v_cndmask_b32_e32 v10, v11, v12, vcc -; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; VI-NEXT: s_lshl_b32 s13, s12, 16 -; VI-NEXT: v_alignbit_b32 v10, v10, v9, 16 -; VI-NEXT: v_add_f32_e32 v9, s13, v31 -; VI-NEXT: v_bfe_u32 v11, v9, 16, 1 -; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v9 -; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 -; VI-NEXT: v_or_b32_e32 v12, 0x400000, v9 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 -; VI-NEXT: s_and_b32 s12, s12, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v9, v11, v12, vcc -; VI-NEXT: v_add_f32_e32 v11, s12, v31 +; VI-NEXT: v_add_f32_e32 v11, s6, v25 ; VI-NEXT: v_bfe_u32 v12, v11, 16, 1 ; VI-NEXT: v_add_u32_e32 v12, vcc, v12, v11 ; VI-NEXT: v_add_u32_e32 v12, vcc, 0x7fff, v12 @@ -168999,53 +170001,33 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 ; VI-NEXT: v_cndmask_b32_e32 v11, v12, v13, vcc ; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; VI-NEXT: s_lshl_b32 s12, s11, 16 -; VI-NEXT: v_alignbit_b32 v9, v11, v9, 16 -; VI-NEXT: v_add_f32_e32 v11, s12, v31 +; VI-NEXT: v_lshrrev_b64 v[10:11], 16, v[10:11] +; VI-NEXT: s_lshl_b32 s6, s34, 16 +; VI-NEXT: v_add_f32_e32 v11, s6, v25 ; VI-NEXT: v_bfe_u32 v12, v11, 16, 1 ; VI-NEXT: v_add_u32_e32 v12, vcc, v12, v11 ; VI-NEXT: v_add_u32_e32 v12, vcc, 0x7fff, v12 ; VI-NEXT: v_or_b32_e32 v13, 0x400000, v11 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 -; VI-NEXT: s_and_b32 s11, s11, 0xffff0000 +; VI-NEXT: s_and_b32 s6, s34, 0xffff0000 ; VI-NEXT: v_cndmask_b32_e32 v11, v12, v13, vcc -; VI-NEXT: v_add_f32_e32 v12, s11, v31 +; VI-NEXT: v_add_f32_e32 v12, s6, v25 ; VI-NEXT: v_bfe_u32 v13, v12, 16, 1 ; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v12 ; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 ; VI-NEXT: v_or_b32_e32 v14, 0x400000, v12 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 +; VI-NEXT: s_lshl_b32 s6, s31, 16 ; VI-NEXT: v_cndmask_b32_e32 v12, v13, v14, vcc -; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; VI-NEXT: s_lshl_b32 s11, s10, 16 -; VI-NEXT: v_alignbit_b32 v12, v12, v11, 16 -; VI-NEXT: v_add_f32_e32 v11, s11, v31 -; VI-NEXT: v_bfe_u32 v13, v11, 16, 1 -; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v11 -; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 -; VI-NEXT: v_or_b32_e32 v14, 0x400000, v11 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 -; VI-NEXT: s_and_b32 s10, s10, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v11, v13, v14, vcc -; VI-NEXT: v_add_f32_e32 v13, s10, v31 -; VI-NEXT: v_bfe_u32 v14, v13, 16, 1 -; VI-NEXT: v_add_u32_e32 v14, vcc, v14, v13 -; VI-NEXT: v_add_u32_e32 v14, vcc, 0x7fff, v14 -; VI-NEXT: v_or_b32_e32 v15, 0x400000, v13 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 -; VI-NEXT: v_cndmask_b32_e32 v13, v14, v15, vcc -; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; VI-NEXT: s_lshl_b32 s10, s9, 16 -; VI-NEXT: v_alignbit_b32 v11, v13, v11, 16 -; VI-NEXT: v_add_f32_e32 v13, s10, v31 +; VI-NEXT: v_add_f32_e32 v13, s6, v25 ; VI-NEXT: v_bfe_u32 v14, v13, 16, 1 ; VI-NEXT: v_add_u32_e32 v14, vcc, v14, 
v13 ; VI-NEXT: v_add_u32_e32 v14, vcc, 0x7fff, v14 ; VI-NEXT: v_or_b32_e32 v15, 0x400000, v13 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 -; VI-NEXT: s_and_b32 s9, s9, 0xffff0000 +; VI-NEXT: s_and_b32 s6, s31, 0xffff0000 ; VI-NEXT: v_cndmask_b32_e32 v13, v14, v15, vcc -; VI-NEXT: v_add_f32_e32 v14, s9, v31 +; VI-NEXT: v_add_f32_e32 v14, s6, v25 ; VI-NEXT: v_bfe_u32 v15, v14, 16, 1 ; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v14 ; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 @@ -169053,53 +170035,33 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 ; VI-NEXT: v_cndmask_b32_e32 v14, v15, v16, vcc ; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; VI-NEXT: s_lshl_b32 s9, s8, 16 -; VI-NEXT: v_alignbit_b32 v14, v14, v13, 16 -; VI-NEXT: v_add_f32_e32 v13, s9, v31 -; VI-NEXT: v_bfe_u32 v15, v13, 16, 1 -; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v13 +; VI-NEXT: v_lshrrev_b64 v[13:14], 16, v[13:14] +; VI-NEXT: s_lshl_b32 s6, s30, 16 +; VI-NEXT: v_add_f32_e32 v14, s6, v25 +; VI-NEXT: v_bfe_u32 v15, v14, 16, 1 +; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v14 ; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 -; VI-NEXT: v_or_b32_e32 v16, 0x400000, v13 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 -; VI-NEXT: s_and_b32 s8, s8, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v13, v15, v16, vcc -; VI-NEXT: v_add_f32_e32 v15, s8, v31 -; VI-NEXT: v_bfe_u32 v16, v15, 16, 1 -; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v15 -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16 -; VI-NEXT: v_or_b32_e32 v17, 0x400000, v15 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 -; VI-NEXT: v_cndmask_b32_e32 v15, v16, v17, vcc -; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; VI-NEXT: s_lshl_b32 s8, s7, 16 -; VI-NEXT: v_alignbit_b32 v13, v15, v13, 16 -; VI-NEXT: v_add_f32_e32 v15, s8, v31 +; VI-NEXT: v_or_b32_e32 v16, 0x400000, v14 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; VI-NEXT: s_and_b32 s6, s30, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v14, v15, v16, vcc +; VI-NEXT: v_add_f32_e32 v15, s6, v25 ; VI-NEXT: v_bfe_u32 v16, v15, 16, 1 ; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v15 ; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16 ; VI-NEXT: v_or_b32_e32 v17, 0x400000, v15 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 -; VI-NEXT: s_and_b32 s7, s7, 0xffff0000 +; VI-NEXT: s_lshl_b32 s6, s91, 16 ; VI-NEXT: v_cndmask_b32_e32 v15, v16, v17, vcc -; VI-NEXT: v_add_f32_e32 v16, s7, v31 +; VI-NEXT: v_add_f32_e32 v16, s6, v25 ; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 ; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 ; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 ; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; VI-NEXT: s_and_b32 s6, s91, 0xffff0000 ; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc -; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; VI-NEXT: s_lshl_b32 s7, s6, 16 -; VI-NEXT: v_alignbit_b32 v16, v16, v15, 16 -; VI-NEXT: v_add_f32_e32 v15, s7, v31 -; VI-NEXT: v_bfe_u32 v17, v15, 16, 1 -; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v15 -; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 -; VI-NEXT: v_or_b32_e32 v18, 0x400000, v15 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 -; VI-NEXT: s_and_b32 s6, s6, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v15, v17, v18, vcc -; VI-NEXT: v_add_f32_e32 v17, s6, v31 +; VI-NEXT: v_add_f32_e32 v17, s6, v25 ; VI-NEXT: v_bfe_u32 v18, v17, 16, 1 ; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v17 ; VI-NEXT: v_add_u32_e32 v18, vcc, 0x7fff, v18 @@ -169107,53 +170069,33 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_cmp_u_f32_e32 vcc, 
v17, v17 ; VI-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc ; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; VI-NEXT: s_lshl_b32 s6, s17, 16 -; VI-NEXT: v_alignbit_b32 v15, v17, v15, 16 -; VI-NEXT: v_add_f32_e32 v17, s6, v31 +; VI-NEXT: v_lshrrev_b64 v[16:17], 16, v[16:17] +; VI-NEXT: s_lshl_b32 s6, s90, 16 +; VI-NEXT: v_add_f32_e32 v17, s6, v25 ; VI-NEXT: v_bfe_u32 v18, v17, 16, 1 ; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v17 ; VI-NEXT: v_add_u32_e32 v18, vcc, 0x7fff, v18 ; VI-NEXT: v_or_b32_e32 v19, 0x400000, v17 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 -; VI-NEXT: s_and_b32 s6, s17, 0xffff0000 +; VI-NEXT: s_and_b32 s6, s90, 0xffff0000 ; VI-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc -; VI-NEXT: v_add_f32_e32 v18, s6, v31 +; VI-NEXT: v_add_f32_e32 v18, s6, v25 ; VI-NEXT: v_bfe_u32 v19, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v19, vcc, v19, v18 ; VI-NEXT: v_add_u32_e32 v19, vcc, 0x7fff, v19 ; VI-NEXT: v_or_b32_e32 v20, 0x400000, v18 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: s_lshl_b32 s6, s89, 16 ; VI-NEXT: v_cndmask_b32_e32 v18, v19, v20, vcc -; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 -; VI-NEXT: s_lshl_b32 s6, s16, 16 -; VI-NEXT: v_alignbit_b32 v18, v18, v17, 16 -; VI-NEXT: v_add_f32_e32 v17, s6, v31 -; VI-NEXT: v_bfe_u32 v19, v17, 16, 1 -; VI-NEXT: v_add_u32_e32 v19, vcc, v19, v17 -; VI-NEXT: v_add_u32_e32 v19, vcc, 0x7fff, v19 -; VI-NEXT: v_or_b32_e32 v20, 0x400000, v17 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 -; VI-NEXT: s_and_b32 s6, s16, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v17, v19, v20, vcc -; VI-NEXT: v_add_f32_e32 v19, s6, v31 -; VI-NEXT: v_bfe_u32 v20, v19, 16, 1 -; VI-NEXT: v_add_u32_e32 v20, vcc, v20, v19 -; VI-NEXT: v_add_u32_e32 v20, vcc, 0x7fff, v20 -; VI-NEXT: v_or_b32_e32 v21, 0x400000, v19 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 -; VI-NEXT: v_cndmask_b32_e32 v19, v20, v21, vcc -; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 -; VI-NEXT: s_lshl_b32 s6, s19, 16 -; VI-NEXT: v_alignbit_b32 v17, v19, v17, 16 -; VI-NEXT: v_add_f32_e32 v19, s6, v31 +; VI-NEXT: v_add_f32_e32 v19, s6, v25 ; VI-NEXT: v_bfe_u32 v20, v19, 16, 1 ; VI-NEXT: v_add_u32_e32 v20, vcc, v20, v19 ; VI-NEXT: v_add_u32_e32 v20, vcc, 0x7fff, v20 ; VI-NEXT: v_or_b32_e32 v21, 0x400000, v19 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 -; VI-NEXT: s_and_b32 s6, s19, 0xffff0000 +; VI-NEXT: s_and_b32 s6, s89, 0xffff0000 ; VI-NEXT: v_cndmask_b32_e32 v19, v20, v21, vcc -; VI-NEXT: v_add_f32_e32 v20, s6, v31 +; VI-NEXT: v_add_f32_e32 v20, s6, v25 ; VI-NEXT: v_bfe_u32 v21, v20, 16, 1 ; VI-NEXT: v_add_u32_e32 v21, vcc, v21, v20 ; VI-NEXT: v_add_u32_e32 v21, vcc, 0x7fff, v21 @@ -169161,863 +170103,1089 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 ; VI-NEXT: v_cndmask_b32_e32 v20, v21, v22, vcc ; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 -; VI-NEXT: s_lshl_b32 s6, s18, 16 -; VI-NEXT: v_alignbit_b32 v20, v20, v19, 16 -; VI-NEXT: v_add_f32_e32 v19, s6, v31 -; VI-NEXT: v_bfe_u32 v21, v19, 16, 1 -; VI-NEXT: v_add_u32_e32 v21, vcc, v21, v19 +; VI-NEXT: v_lshrrev_b64 v[19:20], 16, v[19:20] +; VI-NEXT: s_lshl_b32 s6, s88, 16 +; VI-NEXT: v_add_f32_e32 v20, s6, v25 +; VI-NEXT: v_bfe_u32 v21, v20, 16, 1 +; VI-NEXT: v_add_u32_e32 v21, vcc, v21, v20 ; VI-NEXT: v_add_u32_e32 v21, vcc, 0x7fff, v21 -; VI-NEXT: v_or_b32_e32 v22, 0x400000, v19 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 -; VI-NEXT: s_and_b32 s6, s18, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v19, v21, v22, vcc -; VI-NEXT: v_add_f32_e32 v21, s6, v31 -; VI-NEXT: v_bfe_u32 v22, v21, 16, 1 -; VI-NEXT: 
v_add_u32_e32 v22, vcc, v22, v21 -; VI-NEXT: v_add_u32_e32 v22, vcc, 0x7fff, v22 -; VI-NEXT: v_or_b32_e32 v23, 0x400000, v21 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 -; VI-NEXT: v_cndmask_b32_e32 v21, v22, v23, vcc -; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 -; VI-NEXT: s_lshl_b32 s6, s21, 16 -; VI-NEXT: v_alignbit_b32 v19, v21, v19, 16 -; VI-NEXT: v_add_f32_e32 v21, s6, v31 +; VI-NEXT: v_or_b32_e32 v22, 0x400000, v20 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 +; VI-NEXT: s_and_b32 s6, s88, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v20, v21, v22, vcc +; VI-NEXT: v_add_f32_e32 v21, s6, v25 ; VI-NEXT: v_bfe_u32 v22, v21, 16, 1 ; VI-NEXT: v_add_u32_e32 v22, vcc, v22, v21 ; VI-NEXT: v_add_u32_e32 v22, vcc, 0x7fff, v22 ; VI-NEXT: v_or_b32_e32 v23, 0x400000, v21 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 -; VI-NEXT: s_and_b32 s6, s21, 0xffff0000 +; VI-NEXT: s_lshl_b32 s6, s77, 16 ; VI-NEXT: v_cndmask_b32_e32 v21, v22, v23, vcc -; VI-NEXT: v_add_f32_e32 v22, s6, v31 +; VI-NEXT: v_add_f32_e32 v22, s6, v25 ; VI-NEXT: v_bfe_u32 v23, v22, 16, 1 ; VI-NEXT: v_add_u32_e32 v23, vcc, v23, v22 ; VI-NEXT: v_add_u32_e32 v23, vcc, 0x7fff, v23 ; VI-NEXT: v_or_b32_e32 v24, 0x400000, v22 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 +; VI-NEXT: s_and_b32 s6, s77, 0xffff0000 ; VI-NEXT: v_cndmask_b32_e32 v22, v23, v24, vcc -; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 -; VI-NEXT: s_lshl_b32 s6, s20, 16 -; VI-NEXT: v_alignbit_b32 v22, v22, v21, 16 -; VI-NEXT: v_add_f32_e32 v21, s6, v31 -; VI-NEXT: v_bfe_u32 v23, v21, 16, 1 -; VI-NEXT: v_add_u32_e32 v23, vcc, v23, v21 -; VI-NEXT: v_add_u32_e32 v23, vcc, 0x7fff, v23 -; VI-NEXT: v_or_b32_e32 v24, 0x400000, v21 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 -; VI-NEXT: s_and_b32 s6, s20, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v21, v23, v24, vcc -; VI-NEXT: v_add_f32_e32 v23, s6, v31 +; VI-NEXT: v_add_f32_e32 v23, s6, v25 ; VI-NEXT: v_bfe_u32 v24, v23, 16, 1 ; VI-NEXT: v_add_u32_e32 v24, vcc, v24, v23 ; VI-NEXT: v_add_u32_e32 v24, vcc, 0x7fff, v24 -; VI-NEXT: v_or_b32_e32 v25, 0x400000, v23 +; VI-NEXT: v_or_b32_e32 v26, 0x400000, v23 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 -; VI-NEXT: v_cndmask_b32_e32 v23, v24, v25, vcc +; VI-NEXT: v_cndmask_b32_e32 v23, v24, v26, vcc ; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 -; VI-NEXT: s_lshl_b32 s6, s23, 16 -; VI-NEXT: v_alignbit_b32 v21, v23, v21, 16 -; VI-NEXT: v_add_f32_e32 v23, s6, v31 +; VI-NEXT: v_lshrrev_b64 v[22:23], 16, v[22:23] +; VI-NEXT: s_lshl_b32 s6, s76, 16 +; VI-NEXT: v_add_f32_e32 v23, s6, v25 ; VI-NEXT: v_bfe_u32 v24, v23, 16, 1 ; VI-NEXT: v_add_u32_e32 v24, vcc, v24, v23 ; VI-NEXT: v_add_u32_e32 v24, vcc, 0x7fff, v24 -; VI-NEXT: v_or_b32_e32 v25, 0x400000, v23 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 -; VI-NEXT: s_and_b32 s6, s23, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v23, v24, v25, vcc -; VI-NEXT: v_add_f32_e32 v24, s6, v31 -; VI-NEXT: v_bfe_u32 v25, v24, 16, 1 -; VI-NEXT: v_add_u32_e32 v25, vcc, v25, v24 -; VI-NEXT: v_add_u32_e32 v25, vcc, 0x7fff, v25 -; VI-NEXT: v_or_b32_e32 v26, 0x400000, v24 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 -; VI-NEXT: v_cndmask_b32_e32 v24, v25, v26, vcc -; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 -; VI-NEXT: s_lshl_b32 s6, s22, 16 -; VI-NEXT: v_alignbit_b32 v24, v24, v23, 16 -; VI-NEXT: v_add_f32_e32 v23, s6, v31 -; VI-NEXT: v_bfe_u32 v25, v23, 16, 1 -; VI-NEXT: v_add_u32_e32 v25, vcc, v25, v23 -; VI-NEXT: v_add_u32_e32 v25, vcc, 0x7fff, v25 ; VI-NEXT: v_or_b32_e32 v26, 0x400000, v23 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 -; VI-NEXT: s_and_b32 s6, s22, 0xffff0000 -; VI-NEXT: 
v_cndmask_b32_e32 v23, v25, v26, vcc -; VI-NEXT: v_add_f32_e32 v25, s6, v31 -; VI-NEXT: v_bfe_u32 v26, v25, 16, 1 -; VI-NEXT: v_add_u32_e32 v26, vcc, v26, v25 -; VI-NEXT: v_add_u32_e32 v26, vcc, 0x7fff, v26 -; VI-NEXT: v_or_b32_e32 v27, 0x400000, v25 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 -; VI-NEXT: v_cndmask_b32_e32 v25, v26, v27, vcc -; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 -; VI-NEXT: s_lshl_b32 s6, s25, 16 -; VI-NEXT: v_alignbit_b32 v23, v25, v23, 16 -; VI-NEXT: v_add_f32_e32 v25, s6, v31 -; VI-NEXT: v_bfe_u32 v26, v25, 16, 1 -; VI-NEXT: v_add_u32_e32 v26, vcc, v26, v25 +; VI-NEXT: s_and_b32 s6, s76, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v23, v24, v26, vcc +; VI-NEXT: v_add_f32_e32 v24, s6, v25 +; VI-NEXT: v_bfe_u32 v26, v24, 16, 1 +; VI-NEXT: v_add_u32_e32 v26, vcc, v26, v24 ; VI-NEXT: v_add_u32_e32 v26, vcc, 0x7fff, v26 -; VI-NEXT: v_or_b32_e32 v27, 0x400000, v25 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 -; VI-NEXT: s_and_b32 s6, s25, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v25, v26, v27, vcc -; VI-NEXT: v_add_f32_e32 v26, s6, v31 -; VI-NEXT: v_bfe_u32 v27, v26, 16, 1 -; VI-NEXT: v_add_u32_e32 v27, vcc, v27, v26 -; VI-NEXT: v_add_u32_e32 v27, vcc, 0x7fff, v27 -; VI-NEXT: v_or_b32_e32 v28, 0x400000, v26 +; VI-NEXT: v_or_b32_e32 v27, 0x400000, v24 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 +; VI-NEXT: s_lshl_b32 s6, s17, 16 +; VI-NEXT: v_cndmask_b32_e32 v24, v26, v27, vcc +; VI-NEXT: v_add_f32_e32 v26, s6, v25 +; VI-NEXT: v_readfirstlane_b32 s6, v26 +; VI-NEXT: s_bfe_u32 s7, s6, 0x10010 +; VI-NEXT: s_add_i32 s7, s7, s6 +; VI-NEXT: s_add_i32 s8, s7, 0x7fff +; VI-NEXT: s_or_b32 s9, s6, 0x400000 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 -; VI-NEXT: v_cndmask_b32_e32 v26, v27, v28, vcc -; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 -; VI-NEXT: s_lshl_b32 s6, s24, 16 -; VI-NEXT: v_alignbit_b32 v26, v26, v25, 16 -; VI-NEXT: v_add_f32_e32 v25, s6, v31 -; VI-NEXT: v_bfe_u32 v27, v25, 16, 1 -; VI-NEXT: v_add_u32_e32 v27, vcc, v27, v25 -; VI-NEXT: v_add_u32_e32 v27, vcc, 0x7fff, v27 -; VI-NEXT: v_or_b32_e32 v28, 0x400000, v25 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 -; VI-NEXT: s_and_b32 s6, s24, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v25, v27, v28, vcc -; VI-NEXT: v_add_f32_e32 v27, s6, v31 -; VI-NEXT: v_bfe_u32 v28, v27, 16, 1 -; VI-NEXT: v_add_u32_e32 v28, vcc, v28, v27 -; VI-NEXT: v_add_u32_e32 v28, vcc, 0x7fff, v28 -; VI-NEXT: v_or_b32_e32 v29, 0x400000, v27 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 -; VI-NEXT: v_cndmask_b32_e32 v27, v28, v29, vcc -; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 -; VI-NEXT: s_lshl_b32 s6, s27, 16 -; VI-NEXT: v_alignbit_b32 v25, v27, v25, 16 -; VI-NEXT: v_add_f32_e32 v27, s6, v31 -; VI-NEXT: v_bfe_u32 v28, v27, 16, 1 -; VI-NEXT: v_add_u32_e32 v28, vcc, v28, v27 -; VI-NEXT: v_add_u32_e32 v28, vcc, 0x7fff, v28 -; VI-NEXT: v_or_b32_e32 v29, 0x400000, v27 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 -; VI-NEXT: s_and_b32 s6, s27, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v27, v28, v29, vcc -; VI-NEXT: v_add_f32_e32 v28, s6, v31 -; VI-NEXT: v_bfe_u32 v29, v28, 16, 1 -; VI-NEXT: v_add_u32_e32 v29, vcc, v29, v28 -; VI-NEXT: v_add_u32_e32 v29, vcc, 0x7fff, v29 -; VI-NEXT: v_or_b32_e32 v30, 0x400000, v28 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 -; VI-NEXT: v_cndmask_b32_e32 v28, v29, v30, vcc -; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 -; VI-NEXT: s_lshl_b32 s6, s26, 16 -; VI-NEXT: v_alignbit_b32 v28, v28, v27, 16 -; VI-NEXT: v_add_f32_e32 v27, s6, v31 -; VI-NEXT: v_bfe_u32 v29, v27, 16, 1 -; VI-NEXT: v_add_u32_e32 v29, vcc, v29, v27 -; VI-NEXT: v_add_u32_e32 v29, vcc, 
0x7fff, v29 -; VI-NEXT: v_or_b32_e32 v30, 0x400000, v27 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 -; VI-NEXT: s_and_b32 s6, s26, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v27, v29, v30, vcc -; VI-NEXT: v_add_f32_e32 v29, s6, v31 -; VI-NEXT: v_bfe_u32 v30, v29, 16, 1 -; VI-NEXT: v_add_u32_e32 v30, vcc, v30, v29 -; VI-NEXT: v_add_u32_e32 v30, vcc, 0x7fff, v30 -; VI-NEXT: v_or_b32_e32 v32, 0x400000, v29 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 -; VI-NEXT: v_cndmask_b32_e32 v29, v30, v32, vcc -; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 -; VI-NEXT: s_lshl_b32 s6, s29, 16 -; VI-NEXT: v_alignbit_b32 v27, v29, v27, 16 -; VI-NEXT: v_add_f32_e32 v29, s6, v31 -; VI-NEXT: v_bfe_u32 v30, v29, 16, 1 -; VI-NEXT: v_add_u32_e32 v30, vcc, v30, v29 -; VI-NEXT: v_add_u32_e32 v30, vcc, 0x7fff, v30 -; VI-NEXT: v_or_b32_e32 v32, 0x400000, v29 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 -; VI-NEXT: s_and_b32 s6, s29, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v29, v30, v32, vcc -; VI-NEXT: v_add_f32_e32 v30, s6, v31 -; VI-NEXT: v_bfe_u32 v32, v30, 16, 1 -; VI-NEXT: v_add_u32_e32 v32, vcc, v32, v30 -; VI-NEXT: v_add_u32_e32 v32, vcc, 0x7fff, v32 -; VI-NEXT: v_or_b32_e32 v33, 0x400000, v30 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 -; VI-NEXT: v_cndmask_b32_e32 v30, v32, v33, vcc -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 -; VI-NEXT: s_lshl_b32 s6, s28, 16 -; VI-NEXT: v_alignbit_b32 v30, v30, v29, 16 -; VI-NEXT: v_add_f32_e32 v29, s6, v31 -; VI-NEXT: v_bfe_u32 v32, v29, 16, 1 -; VI-NEXT: v_add_u32_e32 v32, vcc, v32, v29 -; VI-NEXT: v_add_u32_e32 v32, vcc, 0x7fff, v32 -; VI-NEXT: v_or_b32_e32 v33, 0x400000, v29 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 -; VI-NEXT: s_and_b32 s6, s28, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v29, v32, v33, vcc -; VI-NEXT: v_add_f32_e32 v32, s6, v31 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; VI-NEXT: s_lshl_b32 s6, s5, 16 -; VI-NEXT: v_alignbit_b32 v29, v32, v29, 16 -; VI-NEXT: v_add_f32_e32 v32, s6, v31 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: s_and_b64 s[6:7], vcc, exec +; VI-NEXT: s_cselect_b32 s6, s9, s8 +; VI-NEXT: s_and_b32 s7, s17, 0xffff0000 +; VI-NEXT: v_add_f32_e32 v26, s7, v25 +; VI-NEXT: v_readfirstlane_b32 s7, v26 +; VI-NEXT: s_bfe_u32 s8, s7, 0x10010 +; VI-NEXT: s_add_i32 s8, s8, s7 +; VI-NEXT: s_add_i32 s10, s8, 0x7fff +; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; VI-NEXT: s_and_b64 s[8:9], vcc, exec +; VI-NEXT: s_cselect_b32 s7, s7, s10 +; VI-NEXT: s_lshr_b32 s7, s7, 16 +; VI-NEXT: s_lshr_b64 s[6:7], s[6:7], 16 +; VI-NEXT: s_lshl_b32 s7, s16, 16 +; VI-NEXT: v_add_f32_e32 v26, s7, v25 +; VI-NEXT: v_readfirstlane_b32 s7, v26 +; VI-NEXT: s_bfe_u32 s8, s7, 0x10010 +; VI-NEXT: s_add_i32 s8, s8, s7 +; VI-NEXT: s_add_i32 s10, s8, 0x7fff +; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; VI-NEXT: s_and_b64 s[8:9], vcc, exec +; VI-NEXT: s_cselect_b32 s8, s7, s10 +; VI-NEXT: s_and_b32 s7, s16, 0xffff0000 +; VI-NEXT: v_add_f32_e32 v26, s7, v25 +; VI-NEXT: v_readfirstlane_b32 s7, v26 +; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 +; VI-NEXT: s_add_i32 s9, s9, s7 +; VI-NEXT: s_addk_i32 s9, 
0x7fff +; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; VI-NEXT: s_and_b64 s[10:11], vcc, exec +; VI-NEXT: s_cselect_b32 s7, s7, s9 +; VI-NEXT: s_lshr_b32 s9, s7, 16 +; VI-NEXT: s_lshl_b32 s7, s19, 16 +; VI-NEXT: v_add_f32_e32 v26, s7, v25 +; VI-NEXT: v_readfirstlane_b32 s7, v26 +; VI-NEXT: s_lshr_b64 s[16:17], s[8:9], 16 +; VI-NEXT: s_bfe_u32 s8, s7, 0x10010 +; VI-NEXT: s_add_i32 s8, s8, s7 +; VI-NEXT: s_add_i32 s10, s8, 0x7fff +; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; VI-NEXT: s_and_b64 s[8:9], vcc, exec +; VI-NEXT: s_cselect_b32 s8, s7, s10 +; VI-NEXT: s_and_b32 s7, s19, 0xffff0000 +; VI-NEXT: v_add_f32_e32 v26, s7, v25 +; VI-NEXT: v_readfirstlane_b32 s7, v26 +; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 +; VI-NEXT: s_add_i32 s9, s9, s7 +; VI-NEXT: s_addk_i32 s9, 0x7fff +; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; VI-NEXT: s_and_b64 s[10:11], vcc, exec +; VI-NEXT: s_cselect_b32 s7, s7, s9 +; VI-NEXT: s_lshr_b32 s9, s7, 16 +; VI-NEXT: s_lshl_b32 s7, s18, 16 +; VI-NEXT: v_add_f32_e32 v26, s7, v25 +; VI-NEXT: s_lshr_b64 s[8:9], s[8:9], 16 +; VI-NEXT: v_readfirstlane_b32 s7, v26 +; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 +; VI-NEXT: s_add_i32 s9, s9, s7 +; VI-NEXT: s_addk_i32 s9, 0x7fff +; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; VI-NEXT: s_and_b64 s[10:11], vcc, exec +; VI-NEXT: s_cselect_b32 s10, s7, s9 +; VI-NEXT: s_and_b32 s7, s18, 0xffff0000 +; VI-NEXT: v_add_f32_e32 v26, s7, v25 +; VI-NEXT: v_readfirstlane_b32 s7, v26 +; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 +; VI-NEXT: s_add_i32 s9, s9, s7 +; VI-NEXT: s_addk_i32 s9, 0x7fff +; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; VI-NEXT: s_and_b64 s[12:13], vcc, exec +; VI-NEXT: s_cselect_b32 s7, s7, s9 +; VI-NEXT: s_lshr_b32 s11, s7, 16 +; VI-NEXT: s_lshl_b32 s7, s21, 16 +; VI-NEXT: v_add_f32_e32 v26, s7, v25 +; VI-NEXT: v_readfirstlane_b32 s7, v26 +; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 +; VI-NEXT: s_add_i32 s9, s9, s7 +; VI-NEXT: s_lshr_b64 s[18:19], s[10:11], 16 +; VI-NEXT: s_addk_i32 s9, 0x7fff +; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; VI-NEXT: s_and_b64 s[10:11], vcc, exec +; VI-NEXT: s_cselect_b32 s10, s7, s9 +; VI-NEXT: s_and_b32 s7, s21, 0xffff0000 +; VI-NEXT: v_add_f32_e32 v26, s7, v25 +; VI-NEXT: v_readfirstlane_b32 s7, v26 +; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 +; VI-NEXT: s_add_i32 s9, s9, s7 +; VI-NEXT: s_addk_i32 s9, 0x7fff +; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; VI-NEXT: s_and_b64 s[12:13], vcc, exec +; VI-NEXT: s_cselect_b32 s7, s7, s9 +; VI-NEXT: s_lshr_b32 s11, s7, 16 +; VI-NEXT: s_lshl_b32 s7, s20, 16 +; VI-NEXT: v_add_f32_e32 v26, s7, v25 +; VI-NEXT: v_readfirstlane_b32 s7, v26 +; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 +; VI-NEXT: s_add_i32 s9, s9, s7 +; VI-NEXT: s_lshr_b64 s[10:11], s[10:11], 16 +; VI-NEXT: s_addk_i32 s9, 0x7fff +; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; VI-NEXT: s_and_b64 s[12:13], vcc, exec +; VI-NEXT: s_cselect_b32 s12, s7, s9 +; VI-NEXT: s_and_b32 s7, s20, 0xffff0000 +; VI-NEXT: v_add_f32_e32 v26, s7, v25 +; VI-NEXT: v_readfirstlane_b32 s7, v26 +; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 +; VI-NEXT: s_add_i32 s9, s9, s7 +; VI-NEXT: s_addk_i32 s9, 0x7fff +; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; VI-NEXT: s_and_b64 s[14:15], vcc, exec +; VI-NEXT: s_cselect_b32 s7, s7, s9 +; VI-NEXT: s_lshr_b32 s13, s7, 16 +; VI-NEXT: s_lshl_b32 
s7, s23, 16 +; VI-NEXT: v_add_f32_e32 v26, s7, v25 +; VI-NEXT: v_readfirstlane_b32 s7, v26 +; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 +; VI-NEXT: s_add_i32 s9, s9, s7 +; VI-NEXT: s_lshr_b64 s[20:21], s[12:13], 16 +; VI-NEXT: s_addk_i32 s9, 0x7fff +; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; VI-NEXT: s_and_b64 s[12:13], vcc, exec +; VI-NEXT: s_cselect_b32 s12, s7, s9 +; VI-NEXT: s_and_b32 s7, s23, 0xffff0000 +; VI-NEXT: v_add_f32_e32 v26, s7, v25 +; VI-NEXT: v_readfirstlane_b32 s7, v26 +; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 +; VI-NEXT: s_add_i32 s9, s9, s7 +; VI-NEXT: s_addk_i32 s9, 0x7fff +; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; VI-NEXT: s_and_b64 s[14:15], vcc, exec +; VI-NEXT: s_cselect_b32 s7, s7, s9 +; VI-NEXT: s_lshr_b32 s13, s7, 16 +; VI-NEXT: s_lshl_b32 s7, s22, 16 +; VI-NEXT: v_add_f32_e32 v26, s7, v25 +; VI-NEXT: v_readfirstlane_b32 s7, v26 +; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 +; VI-NEXT: s_add_i32 s9, s9, s7 +; VI-NEXT: s_lshr_b64 s[12:13], s[12:13], 16 +; VI-NEXT: s_addk_i32 s9, 0x7fff +; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; VI-NEXT: s_and_b64 s[14:15], vcc, exec +; VI-NEXT: s_cselect_b32 s14, s7, s9 +; VI-NEXT: s_and_b32 s7, s22, 0xffff0000 +; VI-NEXT: v_add_f32_e32 v26, s7, v25 +; VI-NEXT: v_readfirstlane_b32 s7, v26 +; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 +; VI-NEXT: s_add_i32 s9, s9, s7 +; VI-NEXT: s_addk_i32 s9, 0x7fff +; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; VI-NEXT: s_and_b64 s[22:23], vcc, exec +; VI-NEXT: s_cselect_b32 s7, s7, s9 +; VI-NEXT: s_lshr_b32 s15, s7, 16 +; VI-NEXT: s_lshl_b32 s7, s25, 16 +; VI-NEXT: v_add_f32_e32 v26, s7, v25 +; VI-NEXT: v_readfirstlane_b32 s7, v26 +; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 +; VI-NEXT: s_add_i32 s9, s9, s7 +; VI-NEXT: s_lshr_b64 s[22:23], s[14:15], 16 +; VI-NEXT: s_addk_i32 s9, 0x7fff +; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; VI-NEXT: s_and_b64 s[14:15], vcc, exec +; VI-NEXT: s_cselect_b32 s14, s7, s9 +; VI-NEXT: s_and_b32 s7, s25, 0xffff0000 +; VI-NEXT: v_add_f32_e32 v26, s7, v25 +; VI-NEXT: v_readfirstlane_b32 s7, v26 +; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 +; VI-NEXT: s_add_i32 s9, s9, s7 +; VI-NEXT: s_addk_i32 s9, 0x7fff +; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; VI-NEXT: s_and_b64 s[40:41], vcc, exec +; VI-NEXT: s_cselect_b32 s7, s7, s9 +; VI-NEXT: s_lshr_b32 s15, s7, 16 +; VI-NEXT: s_lshl_b32 s7, s24, 16 +; VI-NEXT: v_add_f32_e32 v26, s7, v25 +; VI-NEXT: v_readfirstlane_b32 s7, v26 +; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 +; VI-NEXT: s_add_i32 s9, s9, s7 +; VI-NEXT: s_lshr_b64 s[40:41], s[14:15], 16 +; VI-NEXT: s_addk_i32 s9, 0x7fff +; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; VI-NEXT: s_and_b64 s[14:15], vcc, exec +; VI-NEXT: s_cselect_b32 s14, s7, s9 +; VI-NEXT: s_and_b32 s7, s24, 0xffff0000 +; VI-NEXT: v_add_f32_e32 v26, s7, v25 +; VI-NEXT: v_readfirstlane_b32 s7, v26 +; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 +; VI-NEXT: s_add_i32 s9, s9, s7 +; VI-NEXT: s_addk_i32 s9, 0x7fff +; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; VI-NEXT: s_and_b64 s[24:25], vcc, exec +; VI-NEXT: s_cselect_b32 s7, s7, s9 +; VI-NEXT: s_lshr_b32 s15, s7, 16 +; VI-NEXT: s_lshl_b32 s7, s27, 16 +; VI-NEXT: v_add_f32_e32 v26, s7, v25 +; VI-NEXT: v_readfirstlane_b32 s7, v26 +; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 +; VI-NEXT: s_add_i32 s9, s9, s7 +; VI-NEXT: s_lshr_b64 s[24:25], s[14:15], 16 +; 
VI-NEXT: s_addk_i32 s9, 0x7fff +; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; VI-NEXT: s_and_b64 s[14:15], vcc, exec +; VI-NEXT: s_cselect_b32 s14, s7, s9 +; VI-NEXT: s_and_b32 s7, s27, 0xffff0000 +; VI-NEXT: v_add_f32_e32 v26, s7, v25 +; VI-NEXT: v_readfirstlane_b32 s7, v26 +; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 +; VI-NEXT: s_add_i32 s9, s9, s7 +; VI-NEXT: s_addk_i32 s9, 0x7fff +; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; VI-NEXT: s_and_b64 s[42:43], vcc, exec +; VI-NEXT: s_cselect_b32 s7, s7, s9 +; VI-NEXT: s_lshr_b32 s15, s7, 16 +; VI-NEXT: s_lshl_b32 s7, s26, 16 +; VI-NEXT: v_add_f32_e32 v26, s7, v25 +; VI-NEXT: v_readfirstlane_b32 s7, v26 +; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 +; VI-NEXT: s_add_i32 s9, s9, s7 +; VI-NEXT: s_lshr_b64 s[44:45], s[14:15], 16 +; VI-NEXT: s_addk_i32 s9, 0x7fff +; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; VI-NEXT: s_and_b64 s[14:15], vcc, exec +; VI-NEXT: s_cselect_b32 s14, s7, s9 +; VI-NEXT: s_and_b32 s7, s26, 0xffff0000 +; VI-NEXT: v_add_f32_e32 v26, s7, v25 +; VI-NEXT: v_readfirstlane_b32 s7, v26 +; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 +; VI-NEXT: s_add_i32 s9, s9, s7 +; VI-NEXT: s_addk_i32 s9, 0x7fff +; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; VI-NEXT: s_and_b64 s[26:27], vcc, exec +; VI-NEXT: s_cselect_b32 s7, s7, s9 +; VI-NEXT: s_lshr_b32 s15, s7, 16 +; VI-NEXT: s_lshl_b32 s7, s29, 16 +; VI-NEXT: v_add_f32_e32 v26, s7, v25 +; VI-NEXT: v_readfirstlane_b32 s7, v26 +; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 +; VI-NEXT: s_add_i32 s9, s9, s7 +; VI-NEXT: s_lshr_b64 s[26:27], s[14:15], 16 +; VI-NEXT: s_addk_i32 s9, 0x7fff +; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; VI-NEXT: s_and_b64 s[14:15], vcc, exec +; VI-NEXT: s_cselect_b32 s14, s7, s9 +; VI-NEXT: s_and_b32 s7, s29, 0xffff0000 +; VI-NEXT: v_add_f32_e32 v26, s7, v25 +; VI-NEXT: v_readfirstlane_b32 s7, v26 +; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 +; VI-NEXT: s_add_i32 s9, s9, s7 +; VI-NEXT: s_addk_i32 s9, 0x7fff +; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; VI-NEXT: s_and_b64 s[42:43], vcc, exec +; VI-NEXT: s_cselect_b32 s7, s7, s9 +; VI-NEXT: s_lshr_b32 s15, s7, 16 +; VI-NEXT: s_lshl_b32 s7, s28, 16 +; VI-NEXT: v_add_f32_e32 v26, s7, v25 +; VI-NEXT: v_readfirstlane_b32 s7, v26 +; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 +; VI-NEXT: s_add_i32 s9, s9, s7 +; VI-NEXT: s_lshr_b64 s[58:59], s[14:15], 16 +; VI-NEXT: s_addk_i32 s9, 0x7fff +; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; VI-NEXT: s_and_b64 s[14:15], vcc, exec +; VI-NEXT: s_cselect_b32 s14, s7, s9 +; VI-NEXT: s_and_b32 s7, s28, 0xffff0000 +; VI-NEXT: v_add_f32_e32 v26, s7, v25 +; VI-NEXT: v_readfirstlane_b32 s7, v26 +; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 +; VI-NEXT: s_add_i32 s9, s9, s7 +; VI-NEXT: s_addk_i32 s9, 0x7fff +; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; VI-NEXT: s_and_b64 s[28:29], vcc, exec +; VI-NEXT: s_cselect_b32 s7, s7, s9 +; VI-NEXT: s_lshr_b32 s15, s7, 16 +; VI-NEXT: s_lshl_b32 s7, s5, 16 +; VI-NEXT: v_add_f32_e32 v26, s7, v25 +; VI-NEXT: v_readfirstlane_b32 s7, v26 +; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 +; VI-NEXT: s_add_i32 s9, s9, s7 +; VI-NEXT: s_lshr_b64 s[28:29], s[14:15], 16 +; VI-NEXT: s_addk_i32 s9, 0x7fff +; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; VI-NEXT: s_and_b64 s[14:15], vcc, exec +; VI-NEXT: s_cselect_b32 s14, s7, s9 ; VI-NEXT: s_and_b32 s5, s5, 
0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; VI-NEXT: v_add_f32_e32 v33, s5, v31 -; VI-NEXT: v_bfe_u32 v34, v33, 16, 1 -; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v33 -; VI-NEXT: v_add_u32_e32 v34, vcc, 0x7fff, v34 -; VI-NEXT: v_or_b32_e32 v35, 0x400000, v33 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 -; VI-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; VI-NEXT: v_add_f32_e32 v26, s5, v25 +; VI-NEXT: v_readfirstlane_b32 s5, v26 +; VI-NEXT: s_bfe_u32 s7, s5, 0x10010 +; VI-NEXT: s_add_i32 s7, s7, s5 +; VI-NEXT: s_addk_i32 s7, 0x7fff +; VI-NEXT: s_bitset1_b32 s5, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; VI-NEXT: s_and_b64 s[42:43], vcc, exec +; VI-NEXT: s_cselect_b32 s5, s5, s7 +; VI-NEXT: s_lshr_b32 s15, s5, 16 ; VI-NEXT: s_lshl_b32 s5, s4, 16 -; VI-NEXT: v_alignbit_b32 v32, v33, v32, 16 -; VI-NEXT: v_add_f32_e32 v33, s5, v31 -; VI-NEXT: v_bfe_u32 v34, v33, 16, 1 -; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v33 -; VI-NEXT: v_add_u32_e32 v34, vcc, 0x7fff, v34 +; VI-NEXT: v_add_f32_e32 v26, s5, v25 +; VI-NEXT: v_readfirstlane_b32 s5, v26 +; VI-NEXT: s_bfe_u32 s7, s5, 0x10010 +; VI-NEXT: s_add_i32 s7, s7, s5 +; VI-NEXT: s_lshr_b64 s[62:63], s[14:15], 16 +; VI-NEXT: s_addk_i32 s7, 0x7fff +; VI-NEXT: s_bitset1_b32 s5, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 +; VI-NEXT: s_and_b64 s[14:15], vcc, exec +; VI-NEXT: v_lshrrev_b64 v[23:24], 16, v[23:24] +; VI-NEXT: s_cselect_b32 s14, s5, s7 ; VI-NEXT: s_and_b32 s4, s4, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v35, 0x400000, v33 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 -; VI-NEXT: v_add_f32_e32 v31, s4, v31 -; VI-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc -; VI-NEXT: v_bfe_u32 v34, v31, 16, 1 -; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v31 -; VI-NEXT: v_add_u32_e32 v34, vcc, 0x7fff, v34 -; VI-NEXT: v_or_b32_e32 v35, 0x400000, v31 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 -; VI-NEXT: v_cndmask_b32_e32 v31, v34, v35, vcc -; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 -; VI-NEXT: v_alignbit_b32 v31, v31, v33, 16 -; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[31:32] -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[29:30] -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[27:28] -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[25:26] -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[21:22] -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[19:20] -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[17:18] -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; VI-NEXT: 
buffer_store_dword v34, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[15:16] -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[13:14] -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[11:12] -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[9:10] -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[7:8] -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[5:6] -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[3:4] -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[1:2] -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v32 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v32 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v32 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v31 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v31 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v30 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v30 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v30 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v29 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v29 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v28 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v28 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v28 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v27 -; VI-NEXT: 
buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v27 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v26 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v26 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v26 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v25 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v25 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v24 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v24 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v24 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v23 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v23 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v22 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v22 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v22 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v21 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v21 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v20 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v24, v22 +; VI-NEXT: v_add_f32_e32 v25, s4, v25 +; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; VI-NEXT: v_readfirstlane_b32 s4, v25 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 +; VI-NEXT: v_lshrrev_b64 v[24:25], 24, v[23:24] +; VI-NEXT: v_lshrrev_b64 v[20:21], 16, v[20:21] +; VI-NEXT: v_mov_b32_e32 v21, v19 +; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; VI-NEXT: v_lshrrev_b64 v[24:25], 24, v[20:21] +; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18] +; VI-NEXT: v_mov_b32_e32 v18, v16 +; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; VI-NEXT: v_lshrrev_b64 v[24:25], 24, v[17:18] +; VI-NEXT: v_lshrrev_b64 v[14:15], 16, v[14:15] +; VI-NEXT: v_mov_b32_e32 v15, v13 +; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; VI-NEXT: s_bfe_u32 s5, s4, 0x10010 +; VI-NEXT: 
v_lshrrev_b64 v[24:25], 24, v[14:15] +; VI-NEXT: v_lshrrev_b64 v[11:12], 16, v[11:12] +; VI-NEXT: s_add_i32 s5, s5, s4 +; VI-NEXT: v_mov_b32_e32 v12, v10 +; VI-NEXT: s_add_i32 s7, s5, 0x7fff +; VI-NEXT: s_or_b32 s9, s4, 0x400000 +; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_lshrrev_b64 v[24:25], 24, v[11:12] +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; VI-NEXT: v_lshrrev_b64 v[8:9], 16, v[8:9] +; VI-NEXT: s_cselect_b32 s4, s9, s7 +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_lshrrev_b64 v[5:6], 16, v[5:6] +; VI-NEXT: v_mov_b32_e32 v9, v7 +; VI-NEXT: s_lshr_b32 s15, s4, 16 +; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[2:3], 16, v[2:3] +; VI-NEXT: v_mov_b32_e32 v6, v4 +; VI-NEXT: s_lshr_b64 s[4:5], s[14:15], 16 +; VI-NEXT: v_lshrrev_b64 v[24:25], 24, v[8:9] +; VI-NEXT: v_mov_b32_e32 v3, v1 +; VI-NEXT: s_mov_b32 s27, s44 +; VI-NEXT: s_mov_b32 s29, s58 +; VI-NEXT: s_mov_b32 s5, s62 +; VI-NEXT: v_lshrrev_b64 v[30:31], 24, v[5:6] +; VI-NEXT: s_mov_b32 s17, s6 +; VI-NEXT: s_mov_b32 s19, s8 +; VI-NEXT: s_mov_b32 s21, s10 +; VI-NEXT: s_mov_b32 s23, s12 +; VI-NEXT: s_mov_b32 s25, s40 +; VI-NEXT: s_lshr_b64 s[74:75], s[4:5], 24 +; VI-NEXT: s_lshr_b64 s[78:79], s[28:29], 24 +; VI-NEXT: s_lshr_b64 s[30:31], s[26:27], 24 +; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[31:32], 24, v[2:3] +; VI-NEXT: s_lshr_b64 s[36:37], s[24:25], 24 +; VI-NEXT: s_lshr_b64 s[38:39], s[22:23], 24 +; VI-NEXT: s_lshr_b64 s[48:49], s[20:21], 24 +; VI-NEXT: s_lshr_b64 s[50:51], s[18:19], 24 +; VI-NEXT: s_lshr_b64 s[52:53], s[16:17], 24 +; VI-NEXT: s_lshr_b32 s11, s62, 24 +; VI-NEXT: s_lshr_b32 s13, s62, 16 +; VI-NEXT: s_lshr_b32 s14, s62, 8 +; VI-NEXT: s_lshr_b32 s15, s4, 16 +; VI-NEXT: s_lshr_b32 s17, s4, 8 +; VI-NEXT: s_lshr_b32 s19, s58, 24 +; VI-NEXT: s_lshr_b32 s21, s58, 16 +; VI-NEXT: s_lshr_b32 s23, s58, 8 +; VI-NEXT: s_lshr_b32 s25, s28, 16 +; VI-NEXT: s_lshr_b32 s27, s28, 8 +; VI-NEXT: s_lshr_b32 s29, s44, 24 +; VI-NEXT: s_lshr_b32 s41, s44, 16 +; VI-NEXT: s_lshr_b32 s42, s44, 8 +; VI-NEXT: s_lshr_b32 s43, s26, 16 +; VI-NEXT: s_lshr_b32 s45, s26, 8 +; VI-NEXT: s_lshr_b32 s46, s40, 24 +; VI-NEXT: s_lshr_b32 s47, s40, 16 +; VI-NEXT: s_lshr_b32 s56, s40, 8 +; VI-NEXT: s_lshr_b32 s57, s24, 16 +; VI-NEXT: s_lshr_b32 s59, s24, 8 +; VI-NEXT: s_lshr_b32 s60, s12, 24 +; VI-NEXT: s_lshr_b32 s61, s12, 16 +; VI-NEXT: s_lshr_b32 s63, s12, 8 +; VI-NEXT: s_lshr_b32 s72, s22, 16 +; VI-NEXT: s_lshr_b32 s73, s22, 8 +; VI-NEXT: s_lshr_b32 s75, s10, 24 +; VI-NEXT: s_lshr_b32 s76, s10, 16 +; VI-NEXT: s_lshr_b32 s77, s10, 8 +; VI-NEXT: s_lshr_b32 s79, s20, 16 +; VI-NEXT: s_lshr_b32 s88, s20, 8 +; VI-NEXT: s_lshr_b32 s89, s8, 24 +; VI-NEXT: s_lshr_b32 s90, s8, 16 +; VI-NEXT: s_lshr_b32 s91, s8, 8 +; VI-NEXT: s_lshr_b32 s31, s18, 16 +; VI-NEXT: s_lshr_b32 s34, s18, 8 +; VI-NEXT: s_lshr_b32 vcc_lo, s6, 24 +; VI-NEXT: s_lshr_b32 vcc_hi, s6, 16 +; VI-NEXT: s_lshr_b32 s35, s6, 8 +; VI-NEXT: s_lshr_b32 s9, s16, 16 +; VI-NEXT: s_lshr_b32 s7, s16, 8 +; VI-NEXT: v_lshrrev_b32_e32 v3, 24, v22 +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, 
v22 +; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v22 +; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v23 +; VI-NEXT: v_lshrrev_b32_e32 v15, 8, v23 +; VI-NEXT: v_lshrrev_b32_e32 v18, 24, v19 +; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v32, 8, v19 ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v20 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v20 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v19 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v19 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v18 -; VI-NEXT: v_lshrrev_b32_e32 v34, 24, v2 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v18 -; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v16 -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v2 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v18 -; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v51, 24, v12 -; VI-NEXT: v_lshrrev_b32_e32 v35, 24, v8 -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v2 -; VI-NEXT: v_lshrrev_b64 v[41:42], 24, v[23:24] -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v17 -; VI-NEXT: v_lshrrev_b32_e32 v45, 24, v16 -; VI-NEXT: v_lshrrev_b32_e32 v55, 8, v16 -; VI-NEXT: v_lshrrev_b32_e32 v56, 8, v13 -; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v12 -; VI-NEXT: v_lshrrev_b32_e32 v57, 16, v9 -; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v8 -; VI-NEXT: v_lshrrev_b32_e32 v58, 8, v7 -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v1 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v17 -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v15 -; VI-NEXT: v_lshrrev_b32_e32 v50, 8, v15 -; VI-NEXT: v_lshrrev_b32_e32 v43, 24, v14 -; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v14 -; VI-NEXT: v_lshrrev_b32_e32 v48, 8, v14 -; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v13 -; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v56, 16, v12 -; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v11 -; VI-NEXT: v_lshrrev_b32_e32 v53, 8, v11 -; VI-NEXT: v_lshrrev_b32_e32 v44, 24, v10 +; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v20 +; VI-NEXT: v_lshrrev_b32_e32 v35, 24, v16 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v37, 8, v16 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v39, 8, v17 +; VI-NEXT: v_lshrrev_b32_e32 v48, 24, v13 +; 
VI-NEXT: v_lshrrev_b32_e32 v49, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v50, 8, v13 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v52, 8, v14 +; VI-NEXT: v_lshrrev_b32_e32 v53, 24, v10 ; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v10 -; VI-NEXT: v_lshrrev_b32_e32 v40, 8, v10 -; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v57, 8, v9 -; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v35, 8, v8 -; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v7 -; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v59, 24, v6 -; VI-NEXT: v_lshrrev_b32_e32 v58, 16, v6 -; VI-NEXT: v_lshrrev_b32_e32 v60, 8, v6 -; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v5 -; VI-NEXT: v_lshrrev_b32_e32 v45, 8, v5 -; VI-NEXT: v_lshrrev_b32_e32 v42, 24, v4 -; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v4 -; VI-NEXT: v_lshrrev_b32_e32 v61, 8, v4 -; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v3 -; VI-NEXT: v_lshrrev_b32_e32 v49, 8, v3 -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v1 -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v55, 8, v10 +; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v41, 8, v11 +; VI-NEXT: v_lshrrev_b32_e32 v42, 24, v7 +; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v44, 8, v7 +; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v46, 8, v8 +; VI-NEXT: v_lshrrev_b32_e32 v47, 24, v4 +; VI-NEXT: v_lshrrev_b32_e32 v56, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v58, 8, v4 +; VI-NEXT: v_lshrrev_b32_e32 v57, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v59, 8, v5 +; VI-NEXT: v_lshrrev_b32_e32 v60, 24, v1 +; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v26, 8, v1 +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v27, 8, v2 ; VI-NEXT: s_branch .LBB91_5 ; VI-NEXT: .LBB91_3: +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr7 +; VI-NEXT: ; implicit-def: $sgpr9 +; VI-NEXT: ; implicit-def: $sgpr8 +; VI-NEXT: ; implicit-def: $sgpr11 +; VI-NEXT: ; implicit-def: $sgpr10 +; VI-NEXT: ; implicit-def: $sgpr13 +; VI-NEXT: ; implicit-def: $sgpr12 +; VI-NEXT: ; implicit-def: $sgpr41 +; VI-NEXT: ; implicit-def: $sgpr40 +; VI-NEXT: ; implicit-def: $sgpr45 +; VI-NEXT: ; implicit-def: $sgpr44 +; VI-NEXT: ; implicit-def: $sgpr59 +; VI-NEXT: ; implicit-def: $sgpr58 +; VI-NEXT: ; implicit-def: $sgpr63 +; VI-NEXT: ; implicit-def: $sgpr62 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 ; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr71 -; VI-NEXT: ; implicit-def: $sgpr69 +; VI-NEXT: ; implicit-def: $sgpr47 +; VI-NEXT: ; implicit-def: $sgpr82 +; VI-NEXT: ; implicit-def: $sgpr83 +; VI-NEXT: ; implicit-def: $sgpr56 +; VI-NEXT: ; implicit-def: $sgpr57 +; VI-NEXT: ; implicit-def: $sgpr61 +; VI-NEXT: ; implicit-def: $sgpr72 +; VI-NEXT: ; implicit-def: $sgpr14 +; VI-NEXT: ; implicit-def: $sgpr15 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr78 +; VI-NEXT: ; implicit-def: $sgpr79 +; VI-NEXT: ; implicit-def: $sgpr73 +; VI-NEXT: ; implicit-def: $sgpr74 +; VI-NEXT: ; implicit-def: $sgpr75 +; VI-NEXT: ; 
implicit-def: $sgpr84 +; VI-NEXT: ; implicit-def: $sgpr85 +; VI-NEXT: ; implicit-def: $sgpr86 +; VI-NEXT: ; implicit-def: $sgpr42 +; VI-NEXT: ; implicit-def: $sgpr43 +; VI-NEXT: ; implicit-def: $sgpr87 +; VI-NEXT: ; implicit-def: $sgpr80 ; VI-NEXT: ; implicit-def: $sgpr70 ; VI-NEXT: ; implicit-def: $sgpr68 -; VI-NEXT: ; implicit-def: $sgpr67 ; VI-NEXT: ; implicit-def: $sgpr66 ; VI-NEXT: ; implicit-def: $sgpr64 -; VI-NEXT: ; implicit-def: $sgpr65 -; VI-NEXT: ; implicit-def: $sgpr55 ; VI-NEXT: ; implicit-def: $sgpr54 -; VI-NEXT: ; implicit-def: $sgpr53 -; VI-NEXT: ; implicit-def: $sgpr51 ; VI-NEXT: ; implicit-def: $sgpr52 ; VI-NEXT: ; implicit-def: $sgpr50 -; VI-NEXT: ; implicit-def: $sgpr87 -; VI-NEXT: ; implicit-def: $sgpr86 -; VI-NEXT: ; implicit-def: $sgpr84 -; VI-NEXT: ; implicit-def: $sgpr85 -; VI-NEXT: ; implicit-def: $sgpr83 -; VI-NEXT: ; implicit-def: $sgpr82 -; VI-NEXT: ; implicit-def: $sgpr81 -; VI-NEXT: ; implicit-def: $sgpr80 -; VI-NEXT: ; implicit-def: $sgpr76 -; VI-NEXT: ; implicit-def: $sgpr74 -; VI-NEXT: ; implicit-def: $sgpr72 -; VI-NEXT: ; implicit-def: $sgpr62 -; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; 
VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: v_writelane_b32 v61, s6, 0 +; VI-NEXT: v_writelane_b32 v61, s7, 1 +; VI-NEXT: v_writelane_b32 v61, s8, 2 +; VI-NEXT: v_writelane_b32 v61, s9, 3 +; VI-NEXT: v_writelane_b32 v61, s10, 4 +; VI-NEXT: v_writelane_b32 v61, s11, 5 +; VI-NEXT: v_writelane_b32 v61, s12, 6 +; VI-NEXT: v_writelane_b32 v61, s13, 7 +; VI-NEXT: v_writelane_b32 v61, s40, 8 +; VI-NEXT: v_writelane_b32 v61, s41, 9 +; VI-NEXT: v_writelane_b32 v61, s44, 10 +; VI-NEXT: v_writelane_b32 v61, s45, 11 +; VI-NEXT: v_writelane_b32 v61, s58, 12 +; VI-NEXT: v_writelane_b32 v61, s59, 13 +; VI-NEXT: v_writelane_b32 v61, s62, 14 +; VI-NEXT: v_writelane_b32 v61, s63, 15 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr8 +; VI-NEXT: ; implicit-def: $sgpr10 +; VI-NEXT: ; implicit-def: $sgpr12 +; VI-NEXT: ; implicit-def: $sgpr40 +; VI-NEXT: ; implicit-def: $sgpr44 ; VI-NEXT: ; implicit-def: $sgpr58 -; VI-NEXT: ; implicit-def: $sgpr56 -; VI-NEXT: ; implicit-def: $sgpr48 -; VI-NEXT: ; implicit-def: $sgpr38 -; VI-NEXT: ; implicit-def: $sgpr36 -; VI-NEXT: ; implicit-def: $sgpr34 -; VI-NEXT: ; implicit-def: $sgpr30 -; VI-NEXT: ; implicit-def: $sgpr90 -; VI-NEXT: ; implicit-def: $sgpr88 -; VI-NEXT: ; implicit-def: $sgpr78 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: 
killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr62 ; VI-NEXT: s_branch .LBB91_2 ; VI-NEXT: .LBB91_4: -; VI-NEXT: v_mov_b32_e32 v33, s71 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v33, s69 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v33, s70 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v33, s68 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v33, s67 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v33, s86 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v33, s83 -; VI-NEXT: v_mov_b32_e32 v31, s4 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:112 ; 
4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v33, s82 -; VI-NEXT: v_readlane_b32 s4, v62, 0 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v33, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 1 -; VI-NEXT: v_mov_b32_e32 v40, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 2 -; VI-NEXT: v_mov_b32_e32 v44, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 3 -; VI-NEXT: v_mov_b32_e32 v54, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 4 -; VI-NEXT: v_mov_b32_e32 v53, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 5 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v33, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 6 -; VI-NEXT: v_mov_b32_e32 v51, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 7 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v33, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 8 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v33, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 9 -; VI-NEXT: v_mov_b32_e32 v56, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 10 -; VI-NEXT: v_mov_b32_e32 v47, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 11 -; VI-NEXT: v_mov_b32_e32 v48, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 12 -; VI-NEXT: v_mov_b32_e32 v43, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 13 -; VI-NEXT: v_mov_b32_e32 v46, s4 -; VI-NEXT: v_mov_b32_e32 v45, s72 -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v45, s74 -; VI-NEXT: v_mov_b32_e32 v42, s54 -; VI-NEXT: v_mov_b32_e32 v41, s46 -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v41, s56 -; VI-NEXT: v_readlane_b32 s4, v62, 14 -; VI-NEXT: v_mov_b32_e32 v50, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 15 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v33, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 16 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v33, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 17 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v33, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 18 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v33, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 19 -; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 20 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 21 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 22 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 
v45, s76 -; VI-NEXT: v_readlane_b32 s4, v62, 23 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 24 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 25 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 26 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 27 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 28 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 29 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 30 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 31 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 32 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 33 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 34 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 35 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 36 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 37 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; VI-NEXT: v_readlane_b32 s4, v62, 38 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 39 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 40 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 41 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 42 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 43 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: v_readlane_b32 s4, 
v62, 44 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 45 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 46 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 47 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 48 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 49 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 50 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 51 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 52 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 53 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 54 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 55 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 56 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 57 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: v_mov_b32_e32 v36, s66 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v45, s78 -; VI-NEXT: v_mov_b32_e32 v55, s88 -; VI-NEXT: v_mov_b32_e32 v35, s30 -; VI-NEXT: v_mov_b32_e32 v41, s58 -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v35, s85 -; VI-NEXT: v_mov_b32_e32 v34, s38 -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v52, s64 -; VI-NEXT: v_mov_b32_e32 v59, s87 -; VI-NEXT: v_mov_b32_e32 v41, s60 -; VI-NEXT: v_mov_b32_e32 v55, v50 -; VI-NEXT: v_mov_b32_e32 v58, 
s34 -; VI-NEXT: v_mov_b32_e32 v45, s36 -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v50, v46 -; VI-NEXT: v_mov_b32_e32 v46, v48 -; VI-NEXT: v_mov_b32_e32 v48, v47 -; VI-NEXT: v_mov_b32_e32 v47, v56 -; VI-NEXT: v_mov_b32_e32 v56, v51 -; VI-NEXT: v_mov_b32_e32 v51, s90 -; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v34, s48 -; VI-NEXT: v_mov_b32_e32 v1, s44 -; VI-NEXT: v_mov_b32_e32 v2, s45 -; VI-NEXT: v_mov_b32_e32 v3, s42 -; VI-NEXT: v_mov_b32_e32 v4, s43 -; VI-NEXT: v_mov_b32_e32 v5, s40 -; VI-NEXT: v_mov_b32_e32 v6, s41 -; VI-NEXT: v_mov_b32_e32 v7, s14 -; VI-NEXT: v_mov_b32_e32 v8, s15 -; VI-NEXT: v_mov_b32_e32 v9, s12 -; VI-NEXT: v_mov_b32_e32 v10, s13 -; VI-NEXT: v_mov_b32_e32 v11, s10 -; VI-NEXT: v_mov_b32_e32 v12, s11 -; VI-NEXT: v_mov_b32_e32 v13, s8 -; VI-NEXT: v_mov_b32_e32 v14, s9 -; VI-NEXT: v_mov_b32_e32 v15, s6 -; VI-NEXT: v_mov_b32_e32 v16, s7 -; VI-NEXT: v_mov_b32_e32 v17, s16 -; VI-NEXT: v_mov_b32_e32 v18, s17 -; VI-NEXT: v_mov_b32_e32 v19, s18 -; VI-NEXT: v_mov_b32_e32 v20, s19 -; VI-NEXT: v_mov_b32_e32 v21, s20 -; VI-NEXT: v_mov_b32_e32 v22, s21 -; VI-NEXT: v_mov_b32_e32 v23, s22 -; VI-NEXT: v_mov_b32_e32 v24, s23 -; VI-NEXT: v_mov_b32_e32 v25, s24 -; VI-NEXT: v_mov_b32_e32 v26, s25 -; VI-NEXT: v_mov_b32_e32 v27, s26 -; VI-NEXT: v_mov_b32_e32 v28, s27 -; VI-NEXT: v_mov_b32_e32 v29, s28 -; VI-NEXT: v_mov_b32_e32 v30, s29 -; VI-NEXT: v_mov_b32_e32 v32, s5 -; VI-NEXT: v_mov_b32_e32 v41, s62 -; VI-NEXT: v_mov_b32_e32 v51, v53 -; VI-NEXT: v_mov_b32_e32 v53, v54 -; VI-NEXT: v_mov_b32_e32 v54, v40 -; VI-NEXT: v_mov_b32_e32 v40, s80 -; VI-NEXT: v_mov_b32_e32 v57, s81 -; VI-NEXT: v_mov_b32_e32 v37, s84 -; VI-NEXT: v_mov_b32_e32 v58, s50 -; VI-NEXT: v_mov_b32_e32 v60, s52 -; VI-NEXT: v_mov_b32_e32 v38, s51 -; VI-NEXT: v_mov_b32_e32 v61, s65 -; VI-NEXT: v_mov_b32_e32 v49, s66 -; VI-NEXT: v_mov_b32_e32 v45, s53 -; VI-NEXT: v_mov_b32_e32 v39, s55 -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v28, s50 +; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v28, s52 +; VI-NEXT: v_readlane_b32 s5, v61, 16 +; VI-NEXT: v_mov_b32_e32 v57, s5 +; VI-NEXT: v_readlane_b32 s5, v61, 17 +; VI-NEXT: v_mov_b32_e32 v59, s5 +; VI-NEXT: v_readlane_b32 s5, v61, 18 +; VI-NEXT: v_mov_b32_e32 v47, s5 +; VI-NEXT: v_readlane_b32 s5, v61, 19 +; VI-NEXT: v_mov_b32_e32 v56, s5 +; VI-NEXT: v_readlane_b32 s5, v61, 20 +; VI-NEXT: v_mov_b32_e32 v58, s5 +; VI-NEXT: v_readlane_b32 s5, v61, 21 +; VI-NEXT: v_mov_b32_e32 v25, s5 +; VI-NEXT: v_readlane_b32 s5, v61, 22 +; VI-NEXT: v_mov_b32_e32 v27, s5 +; VI-NEXT: v_readlane_b32 s5, v61, 23 +; VI-NEXT: v_mov_b32_e32 v38, s79 +; VI-NEXT: v_mov_b32_e32 
v39, s78 +; VI-NEXT: v_mov_b32_e32 v35, s75 +; VI-NEXT: v_mov_b32_e32 v36, s74 +; VI-NEXT: v_mov_b32_e32 v60, s5 +; VI-NEXT: v_readlane_b32 s5, v61, 24 +; VI-NEXT: v_mov_b32_e32 v14, s30 +; VI-NEXT: v_mov_b32_e32 v13, s31 +; VI-NEXT: v_readlane_b32 s74, v61, 14 +; VI-NEXT: v_readlane_b32 s78, v61, 12 +; VI-NEXT: v_readlane_b32 s30, v61, 10 +; VI-NEXT: v_mov_b32_e32 v24, s5 +; VI-NEXT: v_readlane_b32 s5, v61, 25 +; VI-NEXT: v_mov_b32_e32 v8, s36 +; VI-NEXT: v_mov_b32_e32 v7, s37 +; VI-NEXT: v_mov_b32_e32 v5, s38 +; VI-NEXT: v_mov_b32_e32 v4, s39 +; VI-NEXT: v_mov_b32_e32 v2, s48 +; VI-NEXT: v_mov_b32_e32 v1, s49 +; VI-NEXT: v_readlane_b32 s75, v61, 15 +; VI-NEXT: v_readlane_b32 s79, v61, 13 +; VI-NEXT: v_readlane_b32 s31, v61, 11 +; VI-NEXT: v_readlane_b32 s36, v61, 8 +; VI-NEXT: v_readlane_b32 s38, v61, 6 +; VI-NEXT: v_readlane_b32 s48, v61, 4 +; VI-NEXT: v_readlane_b32 s50, v61, 2 +; VI-NEXT: v_readlane_b32 s52, v61, 0 +; VI-NEXT: v_mov_b32_e32 v12, s13 +; VI-NEXT: v_mov_b32_e32 v15, s11 +; VI-NEXT: v_mov_b32_e32 v3, s87 +; VI-NEXT: v_mov_b32_e32 v6, s43 +; VI-NEXT: v_mov_b32_e32 v9, s42 +; VI-NEXT: v_mov_b32_e32 v33, s9 +; VI-NEXT: v_mov_b32_e32 v34, s7 +; VI-NEXT: v_mov_b32_e32 v18, s86 +; VI-NEXT: v_mov_b32_e32 v21, s85 +; VI-NEXT: v_mov_b32_e32 v32, s84 +; VI-NEXT: v_mov_b32_e32 v37, s73 +; VI-NEXT: v_mov_b32_e32 v51, s72 +; VI-NEXT: v_mov_b32_e32 v52, s61 +; VI-NEXT: v_mov_b32_e32 v48, s60 +; VI-NEXT: v_mov_b32_e32 v49, s15 +; VI-NEXT: v_mov_b32_e32 v50, s14 +; VI-NEXT: v_mov_b32_e32 v40, s83 +; VI-NEXT: v_mov_b32_e32 v41, s82 +; VI-NEXT: v_mov_b32_e32 v53, s63 +; VI-NEXT: v_mov_b32_e32 v54, s57 +; VI-NEXT: v_mov_b32_e32 v55, s56 +; VI-NEXT: v_mov_b32_e32 v45, s59 +; VI-NEXT: v_mov_b32_e32 v46, s45 +; VI-NEXT: v_mov_b32_e32 v42, s41 +; VI-NEXT: v_mov_b32_e32 v43, s47 +; VI-NEXT: v_mov_b32_e32 v44, s46 +; VI-NEXT: v_mov_b32_e32 v26, s5 +; VI-NEXT: v_mov_b32_e32 v23, s76 +; VI-NEXT: v_mov_b32_e32 v22, s77 +; VI-NEXT: v_mov_b32_e32 v20, s88 +; VI-NEXT: v_mov_b32_e32 v19, s89 +; VI-NEXT: v_mov_b32_e32 v17, s90 +; VI-NEXT: v_mov_b32_e32 v16, s91 +; VI-NEXT: v_mov_b32_e32 v11, s34 +; VI-NEXT: v_mov_b32_e32 v10, s35 +; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v28, s54 +; VI-NEXT: v_mov_b32_e32 v30, s70 +; VI-NEXT: v_mov_b32_e32 v31, s80 +; VI-NEXT: v_readlane_b32 s11, v61, 26 +; VI-NEXT: v_readlane_b32 s13, v61, 27 +; VI-NEXT: v_readlane_b32 s14, v61, 28 +; VI-NEXT: v_readlane_b32 s15, v61, 29 +; VI-NEXT: v_readlane_b32 s17, v61, 30 +; VI-NEXT: v_readlane_b32 s19, v61, 31 +; VI-NEXT: v_readlane_b32 s21, v61, 32 +; VI-NEXT: v_readlane_b32 s23, v61, 33 +; VI-NEXT: v_readlane_b32 s25, v61, 34 +; VI-NEXT: v_readlane_b32 s27, v61, 35 +; VI-NEXT: v_readlane_b32 s29, v61, 36 +; VI-NEXT: v_readlane_b32 s41, v61, 37 +; VI-NEXT: v_readlane_b32 s42, v61, 38 +; VI-NEXT: v_readlane_b32 s43, v61, 39 +; VI-NEXT: v_readlane_b32 s45, v61, 40 +; VI-NEXT: v_readlane_b32 s46, v61, 41 +; VI-NEXT: v_readlane_b32 s47, v61, 42 +; VI-NEXT: v_readlane_b32 s56, v61, 43 +; VI-NEXT: v_readlane_b32 s57, v61, 44 +; VI-NEXT: v_readlane_b32 s59, v61, 45 +; VI-NEXT: v_readlane_b32 s60, v61, 46 +; VI-NEXT: v_readlane_b32 s61, v61, 47 +; VI-NEXT: v_readlane_b32 s63, v61, 48 +; VI-NEXT: v_readlane_b32 s72, v61, 49 +; VI-NEXT: v_readlane_b32 s73, v61, 50 +; VI-NEXT: v_readlane_b32 s75, v61, 51 +; VI-NEXT: v_readlane_b32 s76, v61, 52 +; VI-NEXT: v_readlane_b32 s77, v61, 53 +; 
VI-NEXT: v_readlane_b32 s79, v61, 54 +; VI-NEXT: v_readlane_b32 s88, v61, 55 +; VI-NEXT: v_readlane_b32 s89, v61, 56 +; VI-NEXT: v_readlane_b32 s90, v61, 57 +; VI-NEXT: v_readlane_b32 s91, v61, 58 +; VI-NEXT: v_readlane_b32 s31, v61, 59 +; VI-NEXT: v_readlane_b32 s34, v61, 60 +; VI-NEXT: v_readlane_b32 s37, v61, 9 +; VI-NEXT: v_readlane_b32 vcc_lo, v61, 61 +; VI-NEXT: v_readlane_b32 vcc_hi, v61, 62 +; VI-NEXT: v_readlane_b32 s35, v61, 63 +; VI-NEXT: v_readlane_b32 s9, v62, 0 +; VI-NEXT: v_readlane_b32 s7, v62, 1 +; VI-NEXT: v_readlane_b32 s39, v61, 7 +; VI-NEXT: v_readlane_b32 s49, v61, 5 +; VI-NEXT: v_readlane_b32 s51, v61, 3 +; VI-NEXT: v_readlane_b32 s53, v61, 1 +; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v28, s64 +; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v28, s66 +; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v28, s68 +; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; VI-NEXT: .LBB91_5: ; %end -; VI-NEXT: v_lshlrev_b32_e32 v36, 8, v33 -; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_sdwa v17, v17, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_and_b32 s5, s16, 0xff +; VI-NEXT: s_lshl_b32 s7, s7, 8 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: s_and_b32 s7, s9, 0xff +; VI-NEXT: s_lshl_b32 s9, s52, 8 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: v_mov_b32_e32 v28, s5 +; VI-NEXT: s_and_b32 s5, s6, 0xff +; VI-NEXT: s_lshl_b32 s6, s35, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, vcc_hi, 0xff +; VI-NEXT: s_lshl_b32 s7, vcc_lo, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: v_mov_b32_e32 v29, s5 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s6, s34, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s31, 0xff +; VI-NEXT: s_lshl_b32 s7, s50, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: buffer_store_dword v28, v0, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v28, vcc, 4, v0 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: buffer_store_dword v29, v28, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v29, s5 +; VI-NEXT: s_and_b32 s5, s8, 0xff +; VI-NEXT: s_lshl_b32 s6, s91, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s90, 0xff +; VI-NEXT: s_lshl_b32 s7, s89, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_add_u32_e32 v28, vcc, 8, v0 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: buffer_store_dword v29, v28, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v29, s5 +; VI-NEXT: s_and_b32 s5, s20, 0xff +; VI-NEXT: s_lshl_b32 s6, s88, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s79, 0xff +; VI-NEXT: s_lshl_b32 s7, s48, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; 
VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_add_u32_e32 v28, vcc, 12, v0 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: buffer_store_dword v29, v28, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v29, s5 +; VI-NEXT: s_and_b32 s5, s10, 0xff +; VI-NEXT: s_lshl_b32 s6, s77, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s76, 0xff +; VI-NEXT: s_lshl_b32 s7, s75, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_add_u32_e32 v28, vcc, 16, v0 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: buffer_store_dword v29, v28, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v29, s5 +; VI-NEXT: s_and_b32 s5, s22, 0xff +; VI-NEXT: s_lshl_b32 s6, s73, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s72, 0xff +; VI-NEXT: s_lshl_b32 s7, s38, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_add_u32_e32 v28, vcc, 20, v0 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: buffer_store_dword v29, v28, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v29, s5 +; VI-NEXT: s_and_b32 s5, s12, 0xff +; VI-NEXT: s_lshl_b32 s6, s63, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s61, 0xff +; VI-NEXT: s_lshl_b32 s7, s60, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_add_u32_e32 v28, vcc, 24, v0 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: buffer_store_dword v29, v28, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v29, s5 +; VI-NEXT: s_and_b32 s5, s24, 0xff +; VI-NEXT: s_lshl_b32 s6, s59, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s57, 0xff +; VI-NEXT: s_lshl_b32 s7, s36, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_add_u32_e32 v28, vcc, 28, v0 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: buffer_store_dword v29, v28, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v29, s5 +; VI-NEXT: s_and_b32 s5, s40, 0xff +; VI-NEXT: s_lshl_b32 s6, s56, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s47, 0xff +; VI-NEXT: s_lshl_b32 s7, s46, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_add_u32_e32 v28, vcc, 32, v0 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: buffer_store_dword v29, v28, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v29, s5 +; VI-NEXT: s_and_b32 s5, s26, 0xff +; VI-NEXT: s_lshl_b32 s6, s45, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s43, 0xff +; VI-NEXT: s_lshl_b32 s7, s30, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_add_u32_e32 v28, vcc, 36, v0 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: buffer_store_dword v29, v28, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v29, s5 +; VI-NEXT: s_and_b32 s5, s44, 0xff +; VI-NEXT: s_lshl_b32 s6, s42, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s41, 0xff +; VI-NEXT: s_lshl_b32 s7, s29, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_add_u32_e32 v28, vcc, 40, v0 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: buffer_store_dword v29, v28, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v29, s5 +; VI-NEXT: s_and_b32 s5, s28, 0xff +; VI-NEXT: s_lshl_b32 s6, s27, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s25, 0xff +; VI-NEXT: s_lshl_b32 s7, s78, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s5, s5, 
0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_add_u32_e32 v28, vcc, 44, v0 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: buffer_store_dword v29, v28, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v29, s5 +; VI-NEXT: s_and_b32 s5, s58, 0xff +; VI-NEXT: s_lshl_b32 s6, s23, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s21, 0xff +; VI-NEXT: s_lshl_b32 s7, s19, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_add_u32_e32 v28, vcc, 48, v0 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: buffer_store_dword v29, v28, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v29, s5 +; VI-NEXT: s_and_b32 s4, s4, 0xff +; VI-NEXT: s_lshl_b32 s5, s17, 8 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s15, 0xff +; VI-NEXT: s_lshl_b32 s6, s74, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: v_add_u32_e32 v28, vcc, 52, v0 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: buffer_store_dword v29, v28, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v29, s4 +; VI-NEXT: s_and_b32 s4, s62, 0xff +; VI-NEXT: s_lshl_b32 s5, s14, 8 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s13, 0xff +; VI-NEXT: s_lshl_b32 s6, s11, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: v_lshlrev_b32_e32 v27, 8, v27 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: v_or_b32_sdwa v2, v2, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v27, 8, v31 +; VI-NEXT: v_add_u32_e32 v28, vcc, 56, v0 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: v_or_b32_sdwa v25, v25, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v29, v28, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v28, vcc, 60, v0 +; VI-NEXT: v_mov_b32_e32 v29, s4 +; VI-NEXT: v_or_b32_sdwa v2, v2, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v25, vcc, 64, v0 +; VI-NEXT: buffer_store_dword v29, v28, s[0:3], 0 offen +; VI-NEXT: buffer_store_dword v2, v25, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v26 +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v60 +; VI-NEXT: v_or_b32_sdwa v2, v24, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x44, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v59 +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v30 +; VI-NEXT: v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v57, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x48, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v58 +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v47 +; VI-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v56, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 
0x4c, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v46 +; VI-NEXT: v_or_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_readlane_b32 s87, v63, 31 ; VI-NEXT: v_readlane_b32 s86, v63, 30 ; VI-NEXT: v_readlane_b32 s85, v63, 29 @@ -170050,393 +171218,121 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_readlane_b32 s34, v63, 2 ; VI-NEXT: v_readlane_b32 s31, v63, 1 ; VI-NEXT: v_readlane_b32 s30, v63, 0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v36, 8, v33 -; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_sdwa v18, v18, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v36, 8, v33 -; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v34, v33, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v17, v17, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v17, v33, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v18, vcc, 4, v0 -; VI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18 -; VI-NEXT: v_or_b32_sdwa v18, v19, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v18, vcc, 8, v0 -; VI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18 -; VI-NEXT: v_or_b32_sdwa v17, v20, v17 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v18, v19, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v18, vcc, 12, v0 -; VI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18 -; VI-NEXT: v_or_b32_sdwa v18, v21, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v18, vcc, 16, v0 -; VI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18 -; VI-NEXT: v_or_b32_sdwa v17, v22, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v18, v19, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v18, vcc, 20, v0 -; VI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v41 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18 -; VI-NEXT: v_or_b32_sdwa v18, v23, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v18, vcc, 24, v0 -; VI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18 -; VI-NEXT: v_or_b32_sdwa v17, v24, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v18, v19, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 
src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v18, vcc, 28, v0 -; VI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18 -; VI-NEXT: v_or_b32_sdwa v18, v25, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v18, vcc, 32, v0 -; VI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18 -; VI-NEXT: v_or_b32_sdwa v17, v26, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v18, v19, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v18, vcc, 36, v0 -; VI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18 -; VI-NEXT: v_or_b32_sdwa v18, v27, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v18, vcc, 40, v0 -; VI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18 -; VI-NEXT: v_or_b32_sdwa v17, v28, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v18, v19, v18 
dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v18, vcc, 44, v0 -; VI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18 -; VI-NEXT: v_or_b32_sdwa v18, v29, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v18, vcc, 48, v0 -; VI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18 -; VI-NEXT: v_or_b32_sdwa v17, v30, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v18, v19, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v18, vcc, 52, v0 -; VI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18 -; VI-NEXT: v_or_b32_sdwa v18, v31, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v18, vcc, 56, v0 -; VI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18 -; VI-NEXT: v_or_b32_sdwa v17, v32, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt 
vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v18, v19, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v18, vcc, 60, v0 -; VI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18 -; VI-NEXT: v_or_b32_sdwa v1, v1, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v17, vcc, 64, v0 -; VI-NEXT: buffer_store_dword v1, v17, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; VI-NEXT: v_or_b32_sdwa v2, v17, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x44, v0 -; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v49 -; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; VI-NEXT: v_or_b32_sdwa v1, v52, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x48, v0 -; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v61 -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v42 -; VI-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v39, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x4c, v0 -; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v45 -; VI-NEXT: v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; VI-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v4 +; VI-NEXT: v_or_b32_sdwa v2, v45, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x50, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v60 -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v59 -; VI-NEXT: v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v58, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v44 +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v42 +; VI-NEXT: v_or_b32_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v43, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x54, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v41 +; VI-NEXT: v_or_b32_sdwa v1, v11, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; VI-NEXT: v_or_b32_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v37, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v4 +; VI-NEXT: v_or_b32_sdwa v2, v40, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x58, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v35 -; VI-NEXT: v_or_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v55 +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v53 +; VI-NEXT: v_or_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v54, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x5c, v0 ; VI-NEXT: 
buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v57 -; VI-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v52 +; VI-NEXT: v_or_b32_sdwa v1, v14, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v4 +; VI-NEXT: v_or_b32_sdwa v2, v51, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x60, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v40 -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v44 -; VI-NEXT: v_or_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v54, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v50 +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v48 +; VI-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v49, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x64, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v53 -; VI-NEXT: v_or_b32_sdwa v1, v11, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v39 +; VI-NEXT: v_or_b32_sdwa v1, v17, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; VI-NEXT: v_or_b32_sdwa v2, v51, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v4 +; VI-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x68, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; VI-NEXT: s_waitcnt vmcnt(0) -; 
VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; VI-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v56, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v37 +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v35 +; VI-NEXT: v_or_b32_sdwa v1, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v36, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x6c, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v34 +; VI-NEXT: v_or_b32_sdwa v1, v20, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; VI-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v47, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v4 +; VI-NEXT: v_or_b32_sdwa v2, v33, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x70, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v48 -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v43 -; VI-NEXT: v_or_b32_sdwa v1, v14, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v46, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v32 +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v18 +; VI-NEXT: v_or_b32_sdwa v1, v19, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v21, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x74, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v50 -; VI-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v15 +; VI-NEXT: v_or_b32_sdwa v1, v23, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; VI-NEXT: v_or_b32_sdwa v2, v55, v2 dst_sel:WORD_1 
dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v4 +; VI-NEXT: v_or_b32_sdwa v2, v12, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x78, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7c, v0 -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; VI-NEXT: v_or_b32_sdwa v1, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v9 +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v3 +; VI-NEXT: v_or_b32_sdwa v1, v22, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v6, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7c, v0 ; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload 
+; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload ; VI-NEXT: s_or_saveexec_b64 s[4:5], -1 -; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; VI-NEXT: s_mov_b64 exec, s[4:5] ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] @@ -224735,19 +225631,19 @@ define inreg <64 x half> @bitcast_v64bf16_to_v64f16_scalar(<64 x bfloat> inreg % ; VI-NEXT: v_writelane_b32 v42, s31, 1 ; VI-NEXT: v_mov_b32_e32 v31, v17 ; VI-NEXT: v_mov_b32_e32 v30, v16 -; VI-NEXT: v_mov_b32_e32 v29, v15 +; VI-NEXT: v_mov_b32_e32 v32, v15 +; VI-NEXT: v_mov_b32_e32 v33, v13 +; VI-NEXT: v_mov_b32_e32 v34, v11 +; VI-NEXT: v_mov_b32_e32 v35, v9 +; VI-NEXT: v_mov_b32_e32 v36, v7 +; VI-NEXT: v_mov_b32_e32 v37, v5 +; VI-NEXT: v_mov_b32_e32 v38, v3 ; VI-NEXT: v_mov_b32_e32 v28, v14 -; VI-NEXT: v_mov_b32_e32 v27, v13 ; VI-NEXT: v_mov_b32_e32 v26, v12 -; VI-NEXT: v_mov_b32_e32 v25, v11 ; VI-NEXT: v_mov_b32_e32 v24, v10 -; VI-NEXT: v_mov_b32_e32 v23, v9 ; VI-NEXT: v_mov_b32_e32 v22, v8 -; VI-NEXT: v_mov_b32_e32 v21, v7 ; VI-NEXT: v_mov_b32_e32 v20, v6 -; VI-NEXT: v_mov_b32_e32 v19, v5 -; VI-NEXT: v_mov_b32_e32 v32, v4 -; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v48, v4 ; VI-NEXT: v_mov_b32_e32 v16, v2 ; VI-NEXT: v_readfirstlane_b32 s30, v0 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec @@ -224758,583 +225654,595 @@ define inreg <64 x half> @bitcast_v64bf16_to_v64f16_scalar(<64 x bfloat> inreg % ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_cbranch_execnz .LBB101_4 ; VI-NEXT: .LBB101_2: ; %cmp.true -; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 -; VI-NEXT: s_lshl_b32 s4, s30, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: s_and_b32 s4, s28, 0xffff0000 +; VI-NEXT: v_mov_b32_e32 v17, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v0, s4, v17 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: s_lshl_b32 s4, s28, 16 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_add_f32_e32 v0, s4, v17 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; VI-NEXT: s_and_b32 s4, s30, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v17 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: s_lshl_b32 s4, s30, 16 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: s_lshl_b32 s4, s31, 16 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: 
v_add_f32_e32 v3, s4, v0 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 +; VI-NEXT: v_add_f32_e32 v2, s4, v17 +; VI-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc +; VI-NEXT: v_bfe_u32 v4, v2, 16, 1 +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v2 +; VI-NEXT: s_and_b32 s4, s26, 0xffff0000 ; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: s_and_b32 s4, s31, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: v_add_f32_e32 v4, s4, v0 -; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s29, 16 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_alignbit_b32 v14, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_alignbit_b32 v15, v4, v3, 16 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s29, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s28, 16 -; VI-NEXT: v_alignbit_b32 v13, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s28, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s27, 16 -; VI-NEXT: v_alignbit_b32 v12, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s27, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_lshrrev_b64 v[12:13], 16, v[0:1] +; VI-NEXT: v_add_f32_e32 v0, s4, v17 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 ; VI-NEXT: s_lshl_b32 
s4, s26, 16 -; VI-NEXT: v_alignbit_b32 v11, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s26, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s25, 16 -; VI-NEXT: v_alignbit_b32 v10, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s25, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s24, 16 -; VI-NEXT: v_alignbit_b32 v9, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_lshrrev_b64 v[14:15], 16, v[2:3] +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_add_f32_e32 v0, s4, v17 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; VI-NEXT: s_and_b32 s4, s24, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s23, 16 -; VI-NEXT: v_alignbit_b32 v8, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s22, 16 -; VI-NEXT: v_alignbit_b32 v7, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: 
v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_lshrrev_b64 v[10:11], 16, v[0:1] +; VI-NEXT: v_add_f32_e32 v0, s4, v17 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: s_lshl_b32 s4, s24, 16 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_add_f32_e32 v0, s4, v17 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; VI-NEXT: s_and_b32 s4, s22, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s21, 16 -; VI-NEXT: v_alignbit_b32 v6, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_lshrrev_b64 v[8:9], 16, v[0:1] +; VI-NEXT: v_add_f32_e32 v0, s4, v17 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: s_lshl_b32 s4, s22, 16 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_add_f32_e32 v0, s4, v17 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 +; VI-NEXT: v_lshrrev_b64 v[6:7], 16, v[0:1] +; VI-NEXT: v_add_f32_e32 v0, s4, v17 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 ; VI-NEXT: s_lshl_b32 s4, s20, 16 -; VI-NEXT: v_alignbit_b32 v5, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_add_f32_e32 v0, s4, v17 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, 
s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s19, 16 -; VI-NEXT: v_alignbit_b32 v4, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: v_lshrrev_b64 v[4:5], 16, v[0:1] +; VI-NEXT: v_add_f32_e32 v0, s4, v17 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: s_lshl_b32 s4, s18, 16 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_add_f32_e32 v0, s4, v17 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: v_lshrrev_b64 v[2:3], 16, v[0:1] +; VI-NEXT: v_add_f32_e32 v0, s4, v17 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: s_lshl_b32 s4, s16, 16 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_add_f32_e32 v0, s4, v17 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; VI-NEXT: v_bfe_u32 v3, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: v_add_f32_e32 v1, s4, v17 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_lshl_b32 s4, s17, 16 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_add_f32_e32 v3, s4, v17 +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 ; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v1 +; VI-NEXT: v_add_f32_e32 v1, s4, v17 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v18, v5, v7, vcc +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v18, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v18, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s18, 16 -; VI-NEXT: v_alignbit_b32 v3, v2, v1, 16 -; VI-NEXT: 
v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v18, 0x400000, v1 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v18, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v18, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v2 -; VI-NEXT: v_add_u32_e32 v18, vcc, 0x7fff, v18 -; VI-NEXT: v_or_b32_e32 v33, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v18, v33, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s17, 16 -; VI-NEXT: v_alignbit_b32 v2, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v18, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v1 +; VI-NEXT: s_lshl_b32 s4, s19, 16 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_add_f32_e32 v3, s4, v17 +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v49, v5, v7, vcc +; VI-NEXT: v_add_f32_e32 v5, s4, v17 +; VI-NEXT: v_bfe_u32 v7, v5, 16, 1 +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v5 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: s_lshl_b32 s4, s21, 16 +; VI-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc +; VI-NEXT: v_add_f32_e32 v7, s4, v17 +; VI-NEXT: v_bfe_u32 v9, v7, 16, 1 +; VI-NEXT: v_lshrrev_b64 v[18:19], 16, v[18:19] +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v7 +; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v5 +; VI-NEXT: v_add_f32_e32 v5, s4, v17 +; VI-NEXT: v_or_b32_e32 v11, 0x400000, v7 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; VI-NEXT: v_bfe_u32 v7, v5, 16, 1 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v1 +; VI-NEXT: v_mov_b32_e32 v1, v18 +; VI-NEXT: v_cndmask_b32_e32 v18, v9, v11, vcc +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v5 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: s_lshl_b32 s4, s23, 16 +; VI-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc +; VI-NEXT: v_add_f32_e32 v7, s4, v17 +; VI-NEXT: v_bfe_u32 v9, v7, 16, 1 +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v7 +; VI-NEXT: v_lshrrev_b64 v[49:50], 16, v[49:50] +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; VI-NEXT: v_or_b32_e32 v11, 0x400000, v7 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; VI-NEXT: s_and_b32 s4, s25, 0xffff0000 +; VI-NEXT: v_mov_b32_e32 v3, v49 +; VI-NEXT: v_cndmask_b32_e32 v49, v9, v11, vcc +; VI-NEXT: v_add_f32_e32 v9, s4, v17 +; VI-NEXT: v_bfe_u32 v11, v9, 16, 1 +; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v9 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 +; VI-NEXT: v_or_b32_e32 v13, 0x400000, v9 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; VI-NEXT: s_lshl_b32 s4, s25, 16 +; VI-NEXT: v_cndmask_b32_e32 v9, v11, v13, vcc +; VI-NEXT: v_add_f32_e32 v11, s4, v17 +; VI-NEXT: v_bfe_u32 v13, v11, 16, 1 +; VI-NEXT: v_lshrrev_b64 v[18:19], 16, v[18:19] +; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v11 +; VI-NEXT: s_and_b32 s4, s27, 0xffff0000 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 +; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v9 +; VI-NEXT: v_add_f32_e32 v9, s4, v17 
+; VI-NEXT: v_or_b32_e32 v15, 0x400000, v11 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; VI-NEXT: v_bfe_u32 v11, v9, 16, 1 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v5 +; VI-NEXT: v_mov_b32_e32 v5, v18 +; VI-NEXT: v_cndmask_b32_e32 v18, v13, v15, vcc +; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v9 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 +; VI-NEXT: v_or_b32_e32 v13, 0x400000, v9 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; VI-NEXT: s_lshl_b32 s4, s27, 16 +; VI-NEXT: v_cndmask_b32_e32 v9, v11, v13, vcc +; VI-NEXT: v_add_f32_e32 v11, s4, v17 +; VI-NEXT: v_bfe_u32 v13, v11, 16, 1 +; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v11 +; VI-NEXT: v_lshrrev_b64 v[49:50], 16, v[49:50] +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v11 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; VI-NEXT: s_and_b32 s4, s29, 0xffff0000 +; VI-NEXT: v_mov_b32_e32 v7, v49 +; VI-NEXT: v_cndmask_b32_e32 v49, v13, v15, vcc +; VI-NEXT: v_add_f32_e32 v13, s4, v17 +; VI-NEXT: v_bfe_u32 v15, v13, 16, 1 +; VI-NEXT: v_lshrrev_b64 v[18:19], 16, v[18:19] +; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v13 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v9 +; VI-NEXT: v_mov_b32_e32 v9, v18 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v13 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; VI-NEXT: s_lshl_b32 s4, s29, 16 +; VI-NEXT: v_cndmask_b32_e32 v13, v15, v18, vcc +; VI-NEXT: v_add_f32_e32 v15, s4, v17 +; VI-NEXT: v_bfe_u32 v18, v15, 16, 1 +; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v15 ; VI-NEXT: v_add_u32_e32 v18, vcc, 0x7fff, v18 -; VI-NEXT: v_or_b32_e32 v33, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v18, v33, vcc -; VI-NEXT: v_add_f32_e32 v18, s4, v0 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 -; VI-NEXT: s_lshl_b32 s4, s16, 16 -; VI-NEXT: v_alignbit_b32 v1, v18, v1, 16 -; VI-NEXT: v_add_f32_e32 v18, s4, v0 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v0, s4, v0 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v0 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v0 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v0, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v16 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_or_b32_e32 v19, 0x400000, v15 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; VI-NEXT: s_and_b32 s4, s31, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v18, v18, v19, vcc +; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v13 +; VI-NEXT: v_add_f32_e32 v13, s4, v17 +; VI-NEXT: v_bfe_u32 v15, v13, 16, 1 +; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v13 +; VI-NEXT: v_lshrrev_b64 v[18:19], 16, v[18:19] +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 
+; VI-NEXT: v_or_b32_e32 v19, 0x400000, v13 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; VI-NEXT: s_lshl_b32 s4, s31, 16 +; VI-NEXT: v_cndmask_b32_e32 v13, v15, v19, vcc +; VI-NEXT: v_add_f32_e32 v15, s4, v17 +; VI-NEXT: v_bfe_u32 v17, v15, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v15 +; VI-NEXT: v_lshrrev_b64 v[49:50], 16, v[49:50] +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 +; VI-NEXT: v_or_b32_e32 v19, 0x400000, v15 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; VI-NEXT: v_mov_b32_e32 v11, v49 +; VI-NEXT: v_cndmask_b32_e32 v49, v17, v19, vcc +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v16 ; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v16, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v16 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v16 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v13 +; VI-NEXT: v_mov_b32_e32 v13, v18 +; VI-NEXT: v_bfe_u32 v18, v16, 16, 1 +; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v16 +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x7fff, v18 +; VI-NEXT: v_or_b32_e32 v19, 0x400000, v16 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 -; VI-NEXT: v_cndmask_b32_e32 v16, v33, v34, vcc -; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v17 -; VI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 -; VI-NEXT: v_bfe_u32 v34, v33, 16, 1 -; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v33 -; VI-NEXT: v_add_u32_e32 v34, vcc, 0x7fff, v34 -; VI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 -; VI-NEXT: v_or_b32_e32 v35, 0x400000, v33 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 -; VI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; VI-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc -; VI-NEXT: v_bfe_u32 v34, v17, 16, 1 -; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v17 -; VI-NEXT: v_add_u32_e32 v34, vcc, 0x7fff, v34 -; VI-NEXT: v_or_b32_e32 v35, 0x400000, v17 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 -; VI-NEXT: v_cndmask_b32_e32 v17, v34, v35, vcc -; VI-NEXT: v_lshlrev_b32_e32 v34, 16, v32 -; VI-NEXT: v_add_f32_e32 v34, 0x40c00000, v34 -; VI-NEXT: v_bfe_u32 v35, v34, 16, 1 -; VI-NEXT: v_add_u32_e32 v35, vcc, v35, v34 -; VI-NEXT: v_add_u32_e32 v35, vcc, 0x7fff, v35 -; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; VI-NEXT: v_or_b32_e32 v36, 0x400000, v34 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v34, v34 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_cndmask_b32_e32 v34, v35, v36, vcc -; VI-NEXT: v_bfe_u32 v35, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v35, vcc, v35, v32 -; VI-NEXT: v_add_u32_e32 v35, vcc, 0x7fff, v35 -; VI-NEXT: v_or_b32_e32 v36, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 -; VI-NEXT: v_cndmask_b32_e32 v32, v35, v36, vcc -; VI-NEXT: v_lshlrev_b32_e32 v35, 16, v19 -; VI-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 -; VI-NEXT: v_bfe_u32 v36, v35, 16, 1 -; VI-NEXT: v_add_u32_e32 v36, vcc, v36, v35 -; VI-NEXT: v_add_u32_e32 v36, vcc, 0x7fff, v36 -; VI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 -; VI-NEXT: v_or_b32_e32 v37, 0x400000, v35 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v35, v35 +; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v17 +; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v18, v18, v19, vcc +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 +; VI-NEXT: v_or_b32_e32 v19, 0x400000, v16 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; VI-NEXT: v_cndmask_b32_e32 v16, v17, v19, vcc +; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v18 +; VI-NEXT: 
v_and_b32_e32 v18, 0xffff0000, v38 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v19, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v19, vcc, v19, v18 +; VI-NEXT: v_add_u32_e32 v19, vcc, 0x7fff, v19 +; VI-NEXT: v_or_b32_e32 v21, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_cndmask_b32_e32 v18, v19, v21, vcc +; VI-NEXT: v_lshlrev_b32_e32 v19, 16, v38 ; VI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 -; VI-NEXT: v_cndmask_b32_e32 v35, v36, v37, vcc -; VI-NEXT: v_bfe_u32 v36, v19, 16, 1 -; VI-NEXT: v_add_u32_e32 v36, vcc, v36, v19 -; VI-NEXT: v_add_u32_e32 v36, vcc, 0x7fff, v36 -; VI-NEXT: v_or_b32_e32 v37, 0x400000, v19 +; VI-NEXT: v_bfe_u32 v21, v19, 16, 1 +; VI-NEXT: v_add_u32_e32 v21, vcc, v21, v19 +; VI-NEXT: v_add_u32_e32 v21, vcc, 0x7fff, v21 +; VI-NEXT: v_or_b32_e32 v23, 0x400000, v19 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 -; VI-NEXT: v_cndmask_b32_e32 v19, v36, v37, vcc -; VI-NEXT: v_lshlrev_b32_e32 v36, 16, v20 -; VI-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 -; VI-NEXT: v_bfe_u32 v37, v36, 16, 1 -; VI-NEXT: v_add_u32_e32 v37, vcc, v37, v36 -; VI-NEXT: v_add_u32_e32 v37, vcc, 0x7fff, v37 +; VI-NEXT: v_and_b32_e32 v19, 0xffff0000, v48 +; VI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 +; VI-NEXT: v_cndmask_b32_e32 v38, v21, v23, vcc +; VI-NEXT: v_bfe_u32 v21, v19, 16, 1 +; VI-NEXT: v_add_u32_e32 v21, vcc, v21, v19 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v18 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v48 +; VI-NEXT: v_add_u32_e32 v21, vcc, 0x7fff, v21 +; VI-NEXT: v_or_b32_e32 v23, 0x400000, v19 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_cndmask_b32_e32 v19, v21, v23, vcc +; VI-NEXT: v_bfe_u32 v21, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v21, vcc, v21, v18 +; VI-NEXT: v_add_u32_e32 v21, vcc, 0x7fff, v21 +; VI-NEXT: v_or_b32_e32 v23, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_cndmask_b32_e32 v18, v21, v23, vcc +; VI-NEXT: v_and_b32_e32 v21, 0xffff0000, v37 +; VI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 +; VI-NEXT: v_bfe_u32 v23, v21, 16, 1 +; VI-NEXT: v_add_u32_e32 v23, vcc, v23, v21 +; VI-NEXT: v_add_u32_e32 v23, vcc, 0x7fff, v23 +; VI-NEXT: v_or_b32_e32 v25, 0x400000, v21 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 +; VI-NEXT: v_cndmask_b32_e32 v21, v23, v25, vcc +; VI-NEXT: v_lshlrev_b32_e32 v23, 16, v37 +; VI-NEXT: v_lshrrev_b64 v[49:50], 16, v[49:50] +; VI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 +; VI-NEXT: v_bfe_u32 v25, v23, 16, 1 +; VI-NEXT: v_mov_b32_e32 v15, v49 +; VI-NEXT: v_add_u32_e32 v25, vcc, v25, v23 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v21 +; VI-NEXT: v_lshlrev_b32_e32 v21, 16, v20 ; VI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 -; VI-NEXT: v_or_b32_e32 v38, 0x400000, v36 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v36, v36 +; VI-NEXT: v_add_u32_e32 v25, vcc, 0x7fff, v25 ; VI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 -; VI-NEXT: v_cndmask_b32_e32 v36, v37, v38, vcc -; VI-NEXT: v_bfe_u32 v37, v20, 16, 1 -; VI-NEXT: v_add_u32_e32 v37, vcc, v37, v20 -; VI-NEXT: v_add_u32_e32 v37, vcc, 0x7fff, v37 -; VI-NEXT: v_or_b32_e32 v38, 0x400000, v20 +; VI-NEXT: v_or_b32_e32 v27, 0x400000, v23 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 +; VI-NEXT: v_bfe_u32 v23, v20, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v48, v25, v27, vcc +; VI-NEXT: v_add_u32_e32 v23, vcc, v23, v20 +; VI-NEXT: v_add_u32_e32 v23, vcc, 0x7fff, v23 +; VI-NEXT: v_or_b32_e32 v25, 0x400000, v20 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 -; VI-NEXT: v_cndmask_b32_e32 v20, v37, v38, vcc -; VI-NEXT: v_lshlrev_b32_e32 v37, 16, v21 -; 
VI-NEXT: v_add_f32_e32 v37, 0x40c00000, v37 -; VI-NEXT: v_bfe_u32 v38, v37, 16, 1 -; VI-NEXT: v_add_u32_e32 v38, vcc, v38, v37 -; VI-NEXT: v_add_u32_e32 v38, vcc, 0x7fff, v38 -; VI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 -; VI-NEXT: v_or_b32_e32 v39, 0x400000, v37 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v37, v37 -; VI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 -; VI-NEXT: v_cndmask_b32_e32 v37, v38, v39, vcc -; VI-NEXT: v_bfe_u32 v38, v21, 16, 1 -; VI-NEXT: v_add_u32_e32 v38, vcc, v38, v21 -; VI-NEXT: v_add_u32_e32 v38, vcc, 0x7fff, v38 -; VI-NEXT: v_or_b32_e32 v39, 0x400000, v21 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 -; VI-NEXT: v_cndmask_b32_e32 v21, v38, v39, vcc -; VI-NEXT: v_lshlrev_b32_e32 v38, 16, v22 -; VI-NEXT: v_add_f32_e32 v38, 0x40c00000, v38 -; VI-NEXT: v_bfe_u32 v39, v38, 16, 1 -; VI-NEXT: v_add_u32_e32 v39, vcc, v39, v38 -; VI-NEXT: v_add_u32_e32 v39, vcc, 0x7fff, v39 +; VI-NEXT: v_add_f32_e32 v20, 0x40c00000, v21 +; VI-NEXT: v_bfe_u32 v21, v20, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v23, v23, v25, vcc +; VI-NEXT: v_add_u32_e32 v21, vcc, v21, v20 +; VI-NEXT: v_add_u32_e32 v21, vcc, 0x7fff, v21 +; VI-NEXT: v_or_b32_e32 v25, 0x400000, v20 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 +; VI-NEXT: v_cndmask_b32_e32 v20, v21, v25, vcc +; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v23 +; VI-NEXT: v_and_b32_e32 v23, 0xffff0000, v36 +; VI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 +; VI-NEXT: v_bfe_u32 v25, v23, 16, 1 +; VI-NEXT: v_add_u32_e32 v25, vcc, v25, v23 +; VI-NEXT: v_add_u32_e32 v25, vcc, 0x7fff, v25 +; VI-NEXT: v_or_b32_e32 v27, 0x400000, v23 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 +; VI-NEXT: v_cndmask_b32_e32 v23, v25, v27, vcc +; VI-NEXT: v_lshlrev_b32_e32 v25, 16, v36 +; VI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 +; VI-NEXT: v_bfe_u32 v27, v25, 16, 1 +; VI-NEXT: v_add_u32_e32 v27, vcc, v27, v25 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v23 +; VI-NEXT: v_lshlrev_b32_e32 v23, 16, v22 ; VI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; VI-NEXT: v_or_b32_e32 v48, 0x400000, v38 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v38, v38 +; VI-NEXT: v_add_u32_e32 v27, vcc, 0x7fff, v27 ; VI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 -; VI-NEXT: v_cndmask_b32_e32 v38, v39, v48, vcc -; VI-NEXT: v_bfe_u32 v39, v22, 16, 1 -; VI-NEXT: v_add_u32_e32 v39, vcc, v39, v22 -; VI-NEXT: v_add_u32_e32 v39, vcc, 0x7fff, v39 -; VI-NEXT: v_or_b32_e32 v48, 0x400000, v22 +; VI-NEXT: v_or_b32_e32 v29, 0x400000, v25 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 +; VI-NEXT: v_bfe_u32 v25, v22, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v36, v27, v29, vcc +; VI-NEXT: v_add_u32_e32 v25, vcc, v25, v22 +; VI-NEXT: v_add_u32_e32 v25, vcc, 0x7fff, v25 +; VI-NEXT: v_or_b32_e32 v27, 0x400000, v22 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 -; VI-NEXT: v_cndmask_b32_e32 v22, v39, v48, vcc -; VI-NEXT: v_lshlrev_b32_e32 v39, 16, v23 -; VI-NEXT: v_add_f32_e32 v39, 0x40c00000, v39 -; VI-NEXT: v_bfe_u32 v48, v39, 16, 1 -; VI-NEXT: v_add_u32_e32 v48, vcc, v48, v39 -; VI-NEXT: v_add_u32_e32 v48, vcc, 0x7fff, v48 -; VI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; VI-NEXT: v_or_b32_e32 v49, 0x400000, v39 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v39, v39 -; VI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 -; VI-NEXT: v_cndmask_b32_e32 v39, v48, v49, vcc -; VI-NEXT: v_bfe_u32 v48, v23, 16, 1 -; VI-NEXT: v_add_u32_e32 v48, vcc, v48, v23 -; VI-NEXT: v_add_u32_e32 v48, vcc, 0x7fff, v48 -; VI-NEXT: v_or_b32_e32 v49, 0x400000, v23 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 -; VI-NEXT: v_cndmask_b32_e32 v23, v48, v49, vcc -; VI-NEXT: v_lshlrev_b32_e32 v48, 16, v24 -; VI-NEXT: v_add_f32_e32 v48, 
0x40c00000, v48 -; VI-NEXT: v_bfe_u32 v49, v48, 16, 1 -; VI-NEXT: v_add_u32_e32 v49, vcc, v49, v48 -; VI-NEXT: v_add_u32_e32 v49, vcc, 0x7fff, v49 +; VI-NEXT: v_add_f32_e32 v22, 0x40c00000, v23 +; VI-NEXT: v_bfe_u32 v23, v22, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v25, v25, v27, vcc +; VI-NEXT: v_add_u32_e32 v23, vcc, v23, v22 +; VI-NEXT: v_add_u32_e32 v23, vcc, 0x7fff, v23 +; VI-NEXT: v_or_b32_e32 v27, 0x400000, v22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 +; VI-NEXT: v_cndmask_b32_e32 v22, v23, v27, vcc +; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v25 +; VI-NEXT: v_and_b32_e32 v25, 0xffff0000, v35 +; VI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 +; VI-NEXT: v_bfe_u32 v27, v25, 16, 1 +; VI-NEXT: v_add_u32_e32 v27, vcc, v27, v25 +; VI-NEXT: v_add_u32_e32 v27, vcc, 0x7fff, v27 +; VI-NEXT: v_or_b32_e32 v29, 0x400000, v25 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 +; VI-NEXT: v_cndmask_b32_e32 v25, v27, v29, vcc +; VI-NEXT: v_lshlrev_b32_e32 v27, 16, v35 +; VI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 +; VI-NEXT: v_bfe_u32 v29, v27, 16, 1 +; VI-NEXT: v_add_u32_e32 v29, vcc, v29, v27 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v25 +; VI-NEXT: v_lshlrev_b32_e32 v25, 16, v24 ; VI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 -; VI-NEXT: v_or_b32_e32 v50, 0x400000, v48 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v48, v48 +; VI-NEXT: v_add_u32_e32 v29, vcc, 0x7fff, v29 ; VI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 -; VI-NEXT: v_cndmask_b32_e32 v48, v49, v50, vcc -; VI-NEXT: v_bfe_u32 v49, v24, 16, 1 -; VI-NEXT: v_add_u32_e32 v49, vcc, v49, v24 -; VI-NEXT: v_add_u32_e32 v49, vcc, 0x7fff, v49 -; VI-NEXT: v_or_b32_e32 v50, 0x400000, v24 +; VI-NEXT: v_or_b32_e32 v35, 0x400000, v27 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 +; VI-NEXT: v_bfe_u32 v27, v24, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v50, v29, v35, vcc +; VI-NEXT: v_add_u32_e32 v27, vcc, v27, v24 +; VI-NEXT: v_add_u32_e32 v27, vcc, 0x7fff, v27 +; VI-NEXT: v_or_b32_e32 v29, 0x400000, v24 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 -; VI-NEXT: v_cndmask_b32_e32 v24, v49, v50, vcc -; VI-NEXT: v_lshlrev_b32_e32 v49, 16, v25 -; VI-NEXT: v_add_f32_e32 v49, 0x40c00000, v49 -; VI-NEXT: v_bfe_u32 v50, v49, 16, 1 -; VI-NEXT: v_add_u32_e32 v50, vcc, v50, v49 -; VI-NEXT: v_add_u32_e32 v50, vcc, 0x7fff, v50 -; VI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; VI-NEXT: v_or_b32_e32 v51, 0x400000, v49 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v49, v49 -; VI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 -; VI-NEXT: v_cndmask_b32_e32 v49, v50, v51, vcc -; VI-NEXT: v_bfe_u32 v50, v25, 16, 1 -; VI-NEXT: v_add_u32_e32 v50, vcc, v50, v25 -; VI-NEXT: v_add_u32_e32 v50, vcc, 0x7fff, v50 -; VI-NEXT: v_or_b32_e32 v51, 0x400000, v25 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 -; VI-NEXT: v_cndmask_b32_e32 v25, v50, v51, vcc -; VI-NEXT: v_lshlrev_b32_e32 v50, 16, v26 -; VI-NEXT: v_add_f32_e32 v50, 0x40c00000, v50 -; VI-NEXT: v_bfe_u32 v51, v50, 16, 1 -; VI-NEXT: v_add_u32_e32 v51, vcc, v51, v50 -; VI-NEXT: v_add_u32_e32 v51, vcc, 0x7fff, v51 +; VI-NEXT: v_add_f32_e32 v24, 0x40c00000, v25 +; VI-NEXT: v_bfe_u32 v25, v24, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v27, v27, v29, vcc +; VI-NEXT: v_add_u32_e32 v25, vcc, v25, v24 +; VI-NEXT: v_add_u32_e32 v25, vcc, 0x7fff, v25 +; VI-NEXT: v_or_b32_e32 v29, 0x400000, v24 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 +; VI-NEXT: v_cndmask_b32_e32 v24, v25, v29, vcc +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v27 +; VI-NEXT: v_and_b32_e32 v27, 0xffff0000, v34 +; VI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 +; VI-NEXT: v_bfe_u32 v29, v27, 16, 1 +; VI-NEXT: v_add_u32_e32 v29, vcc, v29, v27 +; VI-NEXT: 
v_add_u32_e32 v29, vcc, 0x7fff, v29 +; VI-NEXT: v_or_b32_e32 v35, 0x400000, v27 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 +; VI-NEXT: v_cndmask_b32_e32 v27, v29, v35, vcc +; VI-NEXT: v_lshlrev_b32_e32 v29, 16, v34 +; VI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; VI-NEXT: v_bfe_u32 v34, v29, 16, 1 +; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v29 +; VI-NEXT: v_add_u32_e32 v34, vcc, 0x7fff, v34 +; VI-NEXT: v_or_b32_e32 v35, 0x400000, v29 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 +; VI-NEXT: v_cndmask_b32_e32 v34, v34, v35, vcc +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v27 +; VI-NEXT: v_lshlrev_b32_e32 v27, 16, v26 ; VI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 -; VI-NEXT: v_or_b32_e32 v52, 0x400000, v50 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v50, v50 ; VI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 -; VI-NEXT: v_cndmask_b32_e32 v50, v51, v52, vcc -; VI-NEXT: v_bfe_u32 v51, v26, 16, 1 -; VI-NEXT: v_add_u32_e32 v51, vcc, v51, v26 -; VI-NEXT: v_add_u32_e32 v51, vcc, 0x7fff, v51 +; VI-NEXT: v_bfe_u32 v29, v26, 16, 1 +; VI-NEXT: v_add_u32_e32 v29, vcc, v29, v26 +; VI-NEXT: v_add_u32_e32 v29, vcc, 0x7fff, v29 ; VI-NEXT: v_or_b32_e32 v52, 0x400000, v26 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 -; VI-NEXT: v_cndmask_b32_e32 v26, v51, v52, vcc -; VI-NEXT: v_lshlrev_b32_e32 v51, 16, v27 -; VI-NEXT: v_add_f32_e32 v51, 0x40c00000, v51 -; VI-NEXT: v_bfe_u32 v52, v51, 16, 1 -; VI-NEXT: v_add_u32_e32 v52, vcc, v52, v51 +; VI-NEXT: v_add_f32_e32 v26, 0x40c00000, v27 +; VI-NEXT: v_bfe_u32 v27, v26, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v29, v29, v52, vcc +; VI-NEXT: v_add_u32_e32 v27, vcc, v27, v26 +; VI-NEXT: v_add_u32_e32 v27, vcc, 0x7fff, v27 +; VI-NEXT: v_or_b32_e32 v52, 0x400000, v26 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; VI-NEXT: v_cndmask_b32_e32 v26, v27, v52, vcc +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v29 +; VI-NEXT: v_and_b32_e32 v29, 0xffff0000, v33 +; VI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; VI-NEXT: v_bfe_u32 v52, v29, 16, 1 +; VI-NEXT: v_add_u32_e32 v52, vcc, v52, v29 ; VI-NEXT: v_add_u32_e32 v52, vcc, 0x7fff, v52 -; VI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 -; VI-NEXT: v_or_b32_e32 v53, 0x400000, v51 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v51, v51 -; VI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 -; VI-NEXT: v_cndmask_b32_e32 v51, v52, v53, vcc -; VI-NEXT: v_bfe_u32 v52, v27, 16, 1 -; VI-NEXT: v_add_u32_e32 v52, vcc, v52, v27 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; VI-NEXT: v_or_b32_e32 v53, 0x400000, v29 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 +; VI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; VI-NEXT: v_cndmask_b32_e32 v29, v52, v53, vcc +; VI-NEXT: v_bfe_u32 v52, v33, 16, 1 +; VI-NEXT: v_add_u32_e32 v52, vcc, v52, v33 ; VI-NEXT: v_add_u32_e32 v52, vcc, 0x7fff, v52 -; VI-NEXT: v_or_b32_e32 v53, 0x400000, v27 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 -; VI-NEXT: v_cndmask_b32_e32 v27, v52, v53, vcc -; VI-NEXT: v_lshlrev_b32_e32 v52, 16, v28 -; VI-NEXT: v_add_f32_e32 v52, 0x40c00000, v52 -; VI-NEXT: v_bfe_u32 v53, v52, 16, 1 -; VI-NEXT: v_add_u32_e32 v53, vcc, v53, v52 -; VI-NEXT: v_add_u32_e32 v53, vcc, 0x7fff, v53 +; VI-NEXT: v_or_b32_e32 v53, 0x400000, v33 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 +; VI-NEXT: v_cndmask_b32_e32 v52, v52, v53, vcc +; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v29 +; VI-NEXT: v_lshlrev_b32_e32 v29, 16, v28 ; VI-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 -; VI-NEXT: v_or_b32_e32 v54, 0x400000, v52 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v52, v52 ; VI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 -; VI-NEXT: v_cndmask_b32_e32 v52, v53, v54, vcc -; VI-NEXT: v_bfe_u32 v53, v28, 16, 1 -; 
VI-NEXT: v_add_u32_e32 v53, vcc, v53, v28 -; VI-NEXT: v_add_u32_e32 v53, vcc, 0x7fff, v53 +; VI-NEXT: v_bfe_u32 v33, v28, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v28 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v54, 0x400000, v28 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 -; VI-NEXT: v_cndmask_b32_e32 v28, v53, v54, vcc -; VI-NEXT: v_lshlrev_b32_e32 v53, 16, v29 -; VI-NEXT: v_add_f32_e32 v53, 0x40c00000, v53 -; VI-NEXT: v_bfe_u32 v54, v53, 16, 1 -; VI-NEXT: v_add_u32_e32 v54, vcc, v54, v53 +; VI-NEXT: v_add_f32_e32 v28, 0x40c00000, v29 +; VI-NEXT: v_bfe_u32 v29, v28, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v33, v33, v54, vcc +; VI-NEXT: v_add_u32_e32 v29, vcc, v29, v28 +; VI-NEXT: v_add_u32_e32 v29, vcc, 0x7fff, v29 +; VI-NEXT: v_or_b32_e32 v54, 0x400000, v28 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 +; VI-NEXT: v_cndmask_b32_e32 v28, v29, v54, vcc +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v33 +; VI-NEXT: v_and_b32_e32 v33, 0xffff0000, v32 +; VI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; VI-NEXT: v_bfe_u32 v54, v33, 16, 1 +; VI-NEXT: v_add_u32_e32 v54, vcc, v54, v33 ; VI-NEXT: v_add_u32_e32 v54, vcc, 0x7fff, v54 -; VI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 -; VI-NEXT: v_or_b32_e32 v55, 0x400000, v53 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v53, v53 -; VI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 -; VI-NEXT: v_cndmask_b32_e32 v53, v54, v55, vcc -; VI-NEXT: v_bfe_u32 v54, v29, 16, 1 -; VI-NEXT: v_add_u32_e32 v54, vcc, v54, v29 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; VI-NEXT: v_or_b32_e32 v55, 0x400000, v33 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_cndmask_b32_e32 v33, v54, v55, vcc +; VI-NEXT: v_bfe_u32 v54, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v54, vcc, v54, v32 ; VI-NEXT: v_add_u32_e32 v54, vcc, 0x7fff, v54 -; VI-NEXT: v_or_b32_e32 v55, 0x400000, v29 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 -; VI-NEXT: v_cndmask_b32_e32 v29, v54, v55, vcc +; VI-NEXT: v_or_b32_e32 v55, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_cndmask_b32_e32 v32, v54, v55, vcc ; VI-NEXT: v_lshlrev_b32_e32 v54, 16, v30 -; VI-NEXT: v_add_f32_e32 v54, 0x40c00000, v54 -; VI-NEXT: v_bfe_u32 v55, v54, 16, 1 -; VI-NEXT: v_add_u32_e32 v55, vcc, v55, v54 -; VI-NEXT: v_add_u32_e32 v55, vcc, 0x7fff, v55 ; VI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 -; VI-NEXT: v_or_b32_e32 v40, 0x400000, v54 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v54, v54 ; VI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 -; VI-NEXT: v_cndmask_b32_e32 v54, v55, v40, vcc ; VI-NEXT: v_bfe_u32 v55, v30, 16, 1 ; VI-NEXT: v_add_u32_e32 v55, vcc, v55, v30 ; VI-NEXT: v_add_u32_e32 v55, vcc, 0x7fff, v55 ; VI-NEXT: v_or_b32_e32 v40, 0x400000, v30 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 +; VI-NEXT: v_add_f32_e32 v54, 0x40c00000, v54 ; VI-NEXT: v_cndmask_b32_e32 v30, v55, v40, vcc -; VI-NEXT: v_lshlrev_b32_e32 v55, 16, v31 -; VI-NEXT: v_add_f32_e32 v55, 0x40c00000, v55 -; VI-NEXT: v_bfe_u32 v40, v55, 16, 1 -; VI-NEXT: v_add_u32_e32 v40, vcc, v40, v55 -; VI-NEXT: v_add_u32_e32 v40, vcc, 0x7fff, v40 -; VI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 -; VI-NEXT: v_or_b32_e32 v41, 0x400000, v55 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v55, v55 -; VI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 -; VI-NEXT: v_cndmask_b32_e32 v55, v40, v41, vcc -; VI-NEXT: v_bfe_u32 v40, v31, 16, 1 -; VI-NEXT: v_add_u32_e32 v40, vcc, v40, v31 +; VI-NEXT: v_bfe_u32 v55, v54, 16, 1 +; VI-NEXT: v_add_u32_e32 v55, vcc, v55, v54 +; VI-NEXT: v_add_u32_e32 v55, vcc, 0x7fff, v55 +; VI-NEXT: v_or_b32_e32 v40, 0x400000, v54 
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v54, v54 +; VI-NEXT: v_cndmask_b32_e32 v54, v55, v40, vcc +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v30 +; VI-NEXT: v_and_b32_e32 v30, 0xffff0000, v31 +; VI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 +; VI-NEXT: v_bfe_u32 v40, v30, 16, 1 +; VI-NEXT: v_add_u32_e32 v40, vcc, v40, v30 ; VI-NEXT: v_add_u32_e32 v40, vcc, 0x7fff, v40 -; VI-NEXT: v_or_b32_e32 v41, 0x400000, v31 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 -; VI-NEXT: v_cndmask_b32_e32 v31, v40, v41, vcc -; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 -; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 -; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 -; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 -; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 -; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 -; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 -; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 -; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 -; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 -; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; VI-NEXT: v_or_b32_e32 v41, 0x400000, v30 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 +; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v31 +; VI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 +; VI-NEXT: v_bfe_u32 v31, v30, 16, 1 +; VI-NEXT: v_lshrrev_b64 v[48:49], 16, v[48:49] +; VI-NEXT: v_cndmask_b32_e32 v40, v40, v41, vcc +; VI-NEXT: v_add_u32_e32 v31, vcc, v31, v30 +; VI-NEXT: v_lshrrev_b64 v[36:37], 16, v[36:37] +; VI-NEXT: v_add_u32_e32 v31, vcc, 0x7fff, v31 +; VI-NEXT: v_mov_b32_e32 v37, v48 +; VI-NEXT: v_lshrrev_b64 v[48:49], 16, v[50:51] +; VI-NEXT: v_or_b32_e32 v41, 0x400000, v30 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 +; VI-NEXT: v_lshrrev_b64 v[34:35], 16, v[34:35] +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; VI-NEXT: v_cndmask_b32_e32 v30, v31, v41, vcc +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v40 +; VI-NEXT: v_mov_b32_e32 v35, v48 +; VI-NEXT: v_lshrrev_b64 v[48:49], 16, v[52:53] ; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; VI-NEXT: v_alignbit_b32 v31, v31, v55, 16 -; VI-NEXT: v_alignbit_b32 v30, v30, v54, 16 -; VI-NEXT: v_alignbit_b32 v29, v29, v53, 16 -; VI-NEXT: v_alignbit_b32 v28, v28, v52, 16 -; VI-NEXT: v_alignbit_b32 v27, v27, v51, 16 -; VI-NEXT: v_alignbit_b32 v26, v26, v50, 16 -; VI-NEXT: v_alignbit_b32 v25, v25, v49, 16 -; VI-NEXT: v_alignbit_b32 v24, v24, v48, 16 -; VI-NEXT: v_alignbit_b32 v23, v23, v39, 16 -; VI-NEXT: v_alignbit_b32 v22, v22, v38, 16 -; VI-NEXT: v_alignbit_b32 v21, v21, v37, 16 -; VI-NEXT: v_alignbit_b32 v20, v20, v36, 16 -; VI-NEXT: v_alignbit_b32 v19, v19, v35, 16 -; VI-NEXT: v_alignbit_b32 v32, v32, v34, 16 -; VI-NEXT: v_alignbit_b32 v17, v17, v33, 16 -; VI-NEXT: v_alignbit_b32 v16, v16, v18, 16 +; VI-NEXT: v_lshrrev_b64 v[32:33], 16, v[32:33] +; VI-NEXT: v_lshrrev_b64 v[50:51], 16, v[30:31] +; VI-NEXT: v_lshrrev_b64 v[38:39], 16, v[38:39] +; VI-NEXT: v_mov_b32_e32 v33, v48 +; VI-NEXT: v_lshrrev_b64 v[30:31], 16, v[54:55] +; VI-NEXT: v_lshrrev_b64 v[28:29], 16, v[28:29] +; VI-NEXT: v_lshrrev_b64 v[26:27], 16, v[26:27] +; VI-NEXT: v_lshrrev_b64 v[24:25], 16, v[24:25] +; VI-NEXT: v_lshrrev_b64 v[22:23], 16, v[22:23] +; VI-NEXT: v_lshrrev_b64 v[20:21], 16, v[20:21] +; VI-NEXT: v_lshrrev_b64 v[48:49], 16, v[18:19] +; VI-NEXT: v_lshrrev_b64 v[16:17], 16, v[16:17] +; VI-NEXT: v_mov_b32_e32 v31, v50 ; VI-NEXT: s_branch .LBB101_5 ; VI-NEXT: .LBB101_3: ; VI-NEXT: s_branch .LBB101_2 @@ -225358,7 +226266,14 @@ define inreg <64 x half> 
@bitcast_v64bf16_to_v64f16_scalar(<64 x bfloat> inreg % ; VI-NEXT: .LBB101_5: ; %end ; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; VI-NEXT: v_mov_b32_e32 v18, v32 +; VI-NEXT: v_mov_b32_e32 v17, v38 +; VI-NEXT: v_mov_b32_e32 v18, v48 +; VI-NEXT: v_mov_b32_e32 v19, v37 +; VI-NEXT: v_mov_b32_e32 v21, v36 +; VI-NEXT: v_mov_b32_e32 v23, v35 +; VI-NEXT: v_mov_b32_e32 v25, v34 +; VI-NEXT: v_mov_b32_e32 v27, v33 +; VI-NEXT: v_mov_b32_e32 v29, v32 ; VI-NEXT: v_readlane_b32 s31, v42, 1 ; VI-NEXT: v_readlane_b32 s30, v42, 0 ; VI-NEXT: s_or_saveexec_b64 s[4:5], -1 @@ -228975,1105 +229890,1194 @@ define inreg <64 x bfloat> @bitcast_v64f16_to_v64bf16_scalar(<64 x half> inreg % ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v37, v0 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:80 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:44 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:48 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:52 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:56 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:60 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:64 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 
offset:72 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:76 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v9 -; SI-NEXT: v_mov_b32_e32 v46, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v2 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:76 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v1 +; SI-NEXT: v_mov_b32_e32 v37, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v46 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v24, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v25, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v29, s20 -; SI-NEXT: v_cvt_f16_f32_e32 v30, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v7, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v18, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v27, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v26, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v25, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v28, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v38, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v30, s27 ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v52 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v29, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v43 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f16_f32_e32 v44, v44 ; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: 
v_cvt_f16_f32_e32 v59, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v45 ; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f16_f32_e32 v60, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v46 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f16_f32_e32 v61, v49 -; SI-NEXT: s_waitcnt vmcnt(10) expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v47 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f16_f32_e32 v56, v56 ; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f16_f32_e32 v53, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v57 ; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v58 ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f16_f32_e32 v54, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v59 ; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f16_f32_e32 v41, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v60 ; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f16_f32_e32 v42, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v61 ; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_cvt_f16_f32_e32 v62, v62 ; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_cvt_f16_f32_e32 v63, v63 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f16_f32_e32 v55, v50 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v9, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v0, s16 -; SI-NEXT: v_cvt_f16_f32_e32 v23, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v36, s22 -; SI-NEXT: v_cvt_f16_f32_e32 v39, s23 -; SI-NEXT: v_cvt_f16_f32_e32 v48, s24 -; SI-NEXT: v_cvt_f16_f32_e32 v49, s25 -; SI-NEXT: v_cvt_f16_f32_e32 v33, s26 -; SI-NEXT: v_cvt_f16_f32_e32 v50, s27 -; SI-NEXT: v_cvt_f16_f32_e32 v51, s28 -; SI-NEXT: v_cvt_f16_f32_e32 v52, s29 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v31 +; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v31, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v33, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v39, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v49, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v52, s28 +; SI-NEXT: v_cvt_f16_f32_e32 v53, s29 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:356 ; 
4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: 
buffer_store_dword v25, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB103_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v23 -; SI-NEXT: v_mov_b32_e32 v23, v6 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v24 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v36 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v25 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v8 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v29 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v10 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v30 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v11 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v12 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v32 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v36 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v18 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v18, v20 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v39 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v20 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v48 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v16 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], 
s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v49 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v17 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v33 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v19 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v50 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v15 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v51 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v22 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v52 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v23 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v40 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v24 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v43 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v54 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v4 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v31 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v23 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v48 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v7 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v33 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v50 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v27 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v51 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v26 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) 
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v55 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v25 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v40 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v8 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v28 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v46 -; SI-NEXT: s_mov_b64 s[4:5], 0 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_mov_b32_e32 v51, v21 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_mov_b32_e32 v36, v22 -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v44 -; SI-NEXT: v_mov_b32_e32 v50, v26 -; SI-NEXT: v_mov_b32_e32 v33, v28 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v38 -; SI-NEXT: v_mov_b32_e32 v38, v7 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v59 -; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v60 -; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v61 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v53 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v54 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v41 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v42 -; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v62 -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v10 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v41 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v11 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v38 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v12 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v13 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v39 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v14 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v29 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v31 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v49 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt 
expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v16 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v42 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v18 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v30 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v19 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v44 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v20 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v52 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v26 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v47 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v22 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v45 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v32 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v56 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v34 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v35 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v45 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v57 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v58 -; SI-NEXT: v_mov_b32_e32 v58, v5 -; SI-NEXT: v_mov_b32_e32 v59, v11 -; SI-NEXT: v_mov_b32_e32 v60, v12 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v63 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v55 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v9 -; SI-NEXT: v_mov_b32_e32 v5, v23 -; SI-NEXT: v_mov_b32_e32 v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v53 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v46 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v34 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v56 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v35 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v58 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v60 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v37 +; SI-NEXT: s_mov_b64 s[4:5], 0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v21 +; SI-NEXT: v_mov_b32_e32 
v38, v54 +; SI-NEXT: v_mov_b32_e32 v52, v51 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v43 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v47 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v57 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v59 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v61 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v62 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v63 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v1 +; SI-NEXT: v_mov_b32_e32 v11, v32 +; SI-NEXT: v_mov_b32_e32 v27, v26 +; SI-NEXT: v_mov_b32_e32 v25, v28 +; SI-NEXT: v_mov_b32_e32 v30, v48 +; SI-NEXT: v_mov_b32_e32 v32, v50 +; SI-NEXT: v_mov_b32_e32 v39, v49 +; SI-NEXT: v_mov_b32_e32 v48, v55 +; SI-NEXT: v_mov_b32_e32 v49, v40 +; SI-NEXT: v_mov_b32_e32 v50, v20 ; SI-NEXT: s_branch .LBB103_3 ; SI-NEXT: .LBB103_2: -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: v_mov_b32_e32 v33, v28 -; SI-NEXT: v_mov_b32_e32 v50, v26 -; SI-NEXT: v_mov_b32_e32 v36, v22 -; SI-NEXT: v_mov_b32_e32 v51, v21 +; SI-NEXT: v_mov_b32_e32 v11, v32 +; SI-NEXT: v_mov_b32_e32 v32, v50 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; 
SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_mov_b32_e32 v52, v51 +; SI-NEXT: v_mov_b32_e32 v38, v54 +; SI-NEXT: v_mov_b32_e32 v18, v20 ; SI-NEXT: s_mov_b64 s[4:5], -1 -; SI-NEXT: ; kill: killed $vgpr0 -; 
SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: v_mov_b32_e32 v5, v6 +; SI-NEXT: ; kill: killed $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v27, v26 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v25, v28 +; SI-NEXT: v_mov_b32_e32 v30, v48 +; SI-NEXT: v_mov_b32_e32 v39, v49 +; SI-NEXT: v_mov_b32_e32 v48, v55 +; SI-NEXT: v_mov_b32_e32 v49, v40 ; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: .LBB103_3: ; %Flow -; SI-NEXT: v_mov_b32_e32 v19, v20 -; SI-NEXT: v_mov_b32_e32 v6, v27 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_mov_b32_e32 v61, v2 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v51, v2 ; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; SI-NEXT: s_cbranch_vccnz .LBB103_5 ; SI-NEXT: ; %bb.4: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v2, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v62 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v8 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v31, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v46 +; SI-NEXT: v_mov_b32_e32 v28, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v24 +; SI-NEXT: v_add_f32_e32 v58, 0x38000000, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v47 +; SI-NEXT: v_add_f32_e32 v47, 0x38000000, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v43 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v46, 
0x38000000, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v42 +; SI-NEXT: v_add_f32_e32 v43, 0x38000000, v33 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v33, v50 +; SI-NEXT: v_add_f32_e32 v42, 0x38000000, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v52 +; SI-NEXT: v_add_f32_e32 v55, 0x38000000, v33 +; SI-NEXT: v_add_f32_e32 v54, 0x38000000, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v13 +; SI-NEXT: v_add_f32_e32 v38, 0x38000000, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v30 +; SI-NEXT: v_add_f32_e32 v48, 0x38000000, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v37 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v21 +; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v33 +; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v17 +; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v11 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v37, v23 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v12 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v40, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v60 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v2 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v10 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v61 +; SI-NEXT: v_add_f32_e32 v62, 0x38000000, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v59 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v9 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v56 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v55, 0x38000000, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v34 -; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v15 -; SI-NEXT: v_add_f32_e32 v54, 0x38000000, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v53 -; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v51 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v35 -; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v10 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v10, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v7 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v32 -; SI-NEXT: v_add_f32_e32 v42, 0x38000000, v42 -; SI-NEXT: v_add_f32_e32 
v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v10 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v52, 0x38000000, v14 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v48, 0x38000000, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v33 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v14 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v46 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v5 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_add_f32_e32 v45, 0x38000000, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v15, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v49, 0x38000000, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v57 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v15 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v47 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v50 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v8, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v56 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v44 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v36 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v21, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v28, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v31, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v32, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v32 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v33, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v34, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v10 +; SI-NEXT: v_add_f32_e32 v56, 0x38000000, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v45 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v16 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v5 +; SI-NEXT: 
v_cvt_f32_f16_e32 v5, v29 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f32_f16_e32 v29, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v18 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v31 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 ; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v34 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v36, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v39 +; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 ; SI-NEXT: v_add_f32_e32 v36, 0x38000000, v36 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v50, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v50, 0x38000000, v50 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v51, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v51, 0x38000000, v51 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v40, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v50, 0x38000000, v50 ; SI-NEXT: v_add_f32_e32 v40, 0x38000000, v40 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v41, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v41, 0x38000000, v41 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v43, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v43, 0x38000000, v43 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v44, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_add_f32_e32 v44, 0x38000000, v44 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v46, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v46, 0x38000000, v46 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v47, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v47, 0x38000000, v47 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v56, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v56, 0x38000000, v56 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v57, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v57, 0x38000000, v57 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v58, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v58, 0x38000000, v58 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v26, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; 
SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v22, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v19, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f32_f16_e32 v53, v11 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v53, 0x38000000, v53 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v61, v19 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v61, 0x38000000, v61 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v52, v11 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v52, 0x38000000, v52 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v63, v19 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v63, 0x38000000, v63 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v49, v11 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v49, 0x38000000, v49 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v35, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v13, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v12, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v11 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v11 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v11 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v59, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:372 ; 4-byte Folded 
Reload +; SI-NEXT: v_cvt_f32_f16_e32 v21, v11 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v41, v13 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v41, 0x38000000, v41 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v11 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v45, v13 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v31, v8 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v45, 0x38000000, v45 +; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v57, v13 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v57, 0x38000000, v57 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v20 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v21 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v23 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v11, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v25 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v20 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v21 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v23 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v11, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v49 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v20 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v21 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: 
s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v23 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v11, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v63 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v20 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v19 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v57 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v3, v31 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v11, v61 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f32_f16_e32 v59, v13 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v59, 0x38000000, v59 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v59 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v59 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v60, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v60, v13 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v20 ; SI-NEXT: v_add_f32_e32 v60, 0x38000000, v60 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v61, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v61, 0x38000000, v61 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v61 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v62, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v62, 0x38000000, v62 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v62 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v63, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v63, 0x38000000, v63 -; SI-NEXT: v_cvt_f16_f32_e32 v63, v63 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v19, v60 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v63 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v19 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:260 ; 4-byte Folded 
Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v62 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v21 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v61 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v11, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v41 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v60 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v14 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v59 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v11, v12 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v8 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v7 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v3, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v16 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v15 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v19 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v8 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v7 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v11 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v12 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v3, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v32 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 
offset:252 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v58 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v57 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v11 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v7 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v3, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v36 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v24 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v12 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v56 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v47 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v46 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v8 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v12 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v11 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v44 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v43 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v41 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v3, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v38 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v7 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v8 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v12 -; 
SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v11 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v36 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v3, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v55 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v8 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v7 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v11 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v12 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v3, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v9 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v32 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v43 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v11 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v12 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v3, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v10 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v46 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v21 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v11 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v8 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v7 
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v3, v58 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v4 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v6 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v16 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v56 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v17 -; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v27 -; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v4 -; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v18 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v4, v15 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v24 -; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v14 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v48 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v39 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v25 -; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v30 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v49 -; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v4 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v29 -; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v54 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v55 -; SI-NEXT: v_mov_b32_e32 v17, v11 -; SI-NEXT: v_mov_b32_e32 v16, v26 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v3 -; SI-NEXT: v_mov_b32_e32 v3, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v62 +; SI-NEXT: v_mov_b32_e32 v36, v12 +; SI-NEXT: v_mov_b32_e32 v35, v19 +; SI-NEXT: v_mov_b32_e32 v8, v14 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v11 +; SI-NEXT: 
v_lshlrev_b32_e32 v33, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v4 -; SI-NEXT: v_mov_b32_e32 v4, v22 -; SI-NEXT: v_mov_b32_e32 v22, v6 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v4 +; SI-NEXT: v_mov_b32_e32 v4, v13 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; SI-NEXT: .LBB103_5: ; %end -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: buffer_store_dword v1, v37, s[0:3], 0 offen +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v37 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v37 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: 
buffer_load_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v37 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v37 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v37 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v37 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: 
v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v37 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v3 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v37 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v4 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v37 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v4 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v37 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v37 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v35 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 
4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v37 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v37 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v36 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v37 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v16 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v37 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v17 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v37 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:264 ; 
4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v21 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v37 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v40 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v13 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v37 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v8 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v22 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v37 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v59 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v38 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v37 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v60 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v58 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v37 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v18 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v10 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v37 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:284 ; 
4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v31 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v15 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v37 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v14 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x60, v37 -; SI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v5 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v24 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v8 -; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x64, v37 -; SI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v48 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v39 -; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x68, v37 -; SI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v9 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v49 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v25 -; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x6c, v37 -; SI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: buffer_load_dword v1, 
off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v52 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v29 -; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x70, v37 -; SI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v10 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v28 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v30 -; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x74, v37 -; SI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v31 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v12 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v43 -; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x78, v37 -; SI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v34 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v11 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v26 -; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x7c, v37 -; SI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v6 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v3 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v33 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v7 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshr_b64 
v[1:2], v[1:2], 16 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload @@ -233838,1081 +234842,1388 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a ; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:12 ; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:16 ; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:76 ; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:72 -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:76 -; SI-NEXT: v_mul_f32_e32 v38, 1.0, v1 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v4 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v5 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v12 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v57, v21 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mul_f32_e32 v62, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v10 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v57 +; SI-NEXT: v_mov_b32_e32 v63, v23 +; SI-NEXT: v_mov_b32_e32 v46, v20 +; SI-NEXT: 
v_mov_b32_e32 v41, v18 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v20 -; SI-NEXT: v_mul_f32_e32 v59, 1.0, v2 -; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; SI-NEXT: v_mul_f32_e32 v4, 1.0, v6 -; SI-NEXT: v_mul_f32_e32 v50, 1.0, v7 -; SI-NEXT: v_mul_f32_e32 v8, 1.0, v9 -; SI-NEXT: v_mul_f32_e32 v6, 1.0, v10 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v25 +; SI-NEXT: v_mul_f32_e32 v47, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v61, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v18, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v20, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v60, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v7, 1.0, v8 +; SI-NEXT: v_mul_f32_e32 v21, 1.0, v9 ; SI-NEXT: v_mul_f32_e32 v45, 1.0, v11 -; SI-NEXT: v_mul_f32_e32 v12, 1.0, v13 -; SI-NEXT: v_mul_f32_e32 v10, 1.0, v14 -; SI-NEXT: v_mul_f32_e32 v15, 1.0, v15 -; SI-NEXT: v_mul_f32_e32 v14, 1.0, v18 -; SI-NEXT: v_mul_f32_e32 v13, 1.0, v19 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; SI-NEXT: v_mul_f32_e32 v18, 1.0, v21 -; SI-NEXT: v_mul_f32_e32 v16, 1.0, v22 -; SI-NEXT: v_mul_f32_e32 v19, 1.0, v23 -; SI-NEXT: v_mul_f32_e32 v23, 1.0, v25 -; SI-NEXT: v_mul_f32_e32 v27, 1.0, v27 -; SI-NEXT: v_mul_f32_e32 v63, 1.0, v28 -; SI-NEXT: v_mul_f32_e32 v7, 1.0, v29 -; SI-NEXT: v_mul_f32_e32 v11, 1.0, v30 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s16 -; SI-NEXT: v_mul_f32_e64 v30, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v22, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v20, 1.0, s19 +; SI-NEXT: v_mul_f32_e32 v56, 1.0, v12 +; SI-NEXT: v_mul_f32_e32 v23, 1.0, v13 +; SI-NEXT: v_mul_f32_e32 v11, 1.0, v14 +; SI-NEXT: v_mul_f32_e32 v59, 1.0, v15 +; SI-NEXT: v_mul_f32_e32 v6, 1.0, v16 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v17 +; SI-NEXT: v_mul_f32_e32 v41, 1.0, v41 +; SI-NEXT: v_mul_f32_e32 v16, 1.0, v19 +; SI-NEXT: v_mul_f32_e32 v58, 1.0, v46 +; SI-NEXT: v_mul_f32_e32 v17, 1.0, v22 +; SI-NEXT: v_mul_f32_e32 v14, 1.0, v63 +; SI-NEXT: v_mul_f32_e32 v15, 1.0, v24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v63, 1.0, v26 +; SI-NEXT: v_mul_f32_e32 v10, 1.0, v27 +; SI-NEXT: v_mul_f32_e32 v12, 1.0, v28 +; SI-NEXT: v_mul_f32_e32 v27, 1.0, v29 +; SI-NEXT: v_mul_f32_e32 v28, 1.0, v30 +; SI-NEXT: v_mul_f32_e64 v30, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v26, 1.0, s20 ; SI-NEXT: v_mul_f32_e64 v29, 1.0, s21 -; SI-NEXT: v_mul_f32_e64 v28, 1.0, s22 -; SI-NEXT: v_mul_f32_e64 v9, 1.0, s27 -; SI-NEXT: v_mul_f32_e64 v21, 1.0, s28 +; SI-NEXT: v_mul_f32_e64 v24, 1.0, s24 +; SI-NEXT: v_mul_f32_e64 v25, 1.0, s25 +; SI-NEXT: v_mul_f32_e64 v19, 1.0, s28 +; SI-NEXT: v_mul_f32_e64 v22, 1.0, s29 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; SI-NEXT: v_mul_f32_e32 v31, 1.0, v17 -; SI-NEXT: v_mul_f32_e32 v17, 1.0, v24 -; SI-NEXT: v_mul_f32_e32 v24, 1.0, v26 -; SI-NEXT: v_mul_f32_e32 v62, 1.0, v32 -; SI-NEXT: v_mul_f32_e32 v61, 1.0, v33 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v32 +; SI-NEXT: v_mul_f32_e32 v8, 1.0, v33 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mul_f32_e32 v53, 1.0, v34 -; SI-NEXT: v_mul_f32_e32 v35, 1.0, v35 -; SI-NEXT: v_mul_f32_e32 v25, 1.0, v36 -; SI-NEXT: v_mul_f32_e32 v60, 1.0, v37 -; SI-NEXT: v_mul_f32_e32 v37, 1.0, v39 -; SI-NEXT: v_mul_f32_e32 v36, 1.0, v48 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v49 -; SI-NEXT: v_mul_f32_e32 v56, 1.0, v51 -; SI-NEXT: v_mul_f32_e32 v5, 1.0, v52 -; SI-NEXT: v_mul_f32_e32 v39, 1.0, v54 +; SI-NEXT: 
v_mul_f32_e32 v9, 1.0, v34
+; SI-NEXT: v_mul_f32_e32 v34, 1.0, v35
+; SI-NEXT: v_mul_f32_e32 v31, 1.0, v36
+; SI-NEXT: v_mul_f32_e32 v4, 1.0, v37
+; SI-NEXT: v_mul_f32_e32 v37, 1.0, v38
+; SI-NEXT: v_mul_f32_e32 v36, 1.0, v39
+; SI-NEXT: v_mul_f32_e32 v46, 1.0, v48
 ; SI-NEXT: s_waitcnt vmcnt(13)
-; SI-NEXT: v_mul_f32_e32 v51, 1.0, v55
+; SI-NEXT: v_mul_f32_e32 v49, 1.0, v49
 ; SI-NEXT: s_waitcnt vmcnt(12)
-; SI-NEXT: v_mul_f32_e32 v55, 1.0, v40
+; SI-NEXT: v_mul_f32_e32 v50, 1.0, v50
 ; SI-NEXT: s_waitcnt vmcnt(11)
-; SI-NEXT: v_mul_f32_e32 v42, 1.0, v41
+; SI-NEXT: v_mul_f32_e32 v39, 1.0, v51
 ; SI-NEXT: s_waitcnt vmcnt(10)
-; SI-NEXT: v_mul_f32_e32 v48, 1.0, v43
+; SI-NEXT: v_mul_f32_e32 v38, 1.0, v52
 ; SI-NEXT: s_waitcnt vmcnt(9)
-; SI-NEXT: v_mul_f32_e32 v49, 1.0, v44
-; SI-NEXT: s_waitcnt vmcnt(8)
-; SI-NEXT: v_mul_f32_e32 v47, 1.0, v46
+; SI-NEXT: v_mul_f32_e32 v57, 1.0, v53
+; SI-NEXT: s_waitcnt vmcnt(8) expcnt(0)
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v54
 ; SI-NEXT: s_waitcnt vmcnt(7)
-; SI-NEXT: v_mul_f32_e32 v43, 1.0, v57
+; SI-NEXT: v_mul_f32_e32 v55, 1.0, v55
 ; SI-NEXT: s_waitcnt vmcnt(6)
-; SI-NEXT: v_mul_f32_e32 v58, 1.0, v58
-; SI-NEXT: v_mul_f32_e64 v54, 1.0, s20
-; SI-NEXT: v_mul_f32_e64 v26, 1.0, s23
-; SI-NEXT: v_mul_f32_e64 v52, 1.0, s24
-; SI-NEXT: v_mul_f32_e64 v34, 1.0, s25
-; SI-NEXT: v_mul_f32_e64 v33, 1.0, s26
-; SI-NEXT: v_mul_f32_e64 v32, 1.0, s29
-; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill
+; SI-NEXT: v_mul_f32_e32 v53, 1.0, v40
+; SI-NEXT: s_waitcnt vmcnt(5)
+; SI-NEXT: v_mul_f32_e32 v42, 1.0, v42
+; SI-NEXT: s_waitcnt vmcnt(4)
+; SI-NEXT: v_mul_f32_e32 v43, 1.0, v43
+; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: v_mul_f32_e32 v13, 1.0, v44
+; SI-NEXT: v_mul_f32_e64 v32, 1.0, s17
+; SI-NEXT: v_mul_f32_e64 v35, 1.0, s18
+; SI-NEXT: v_mul_f32_e64 v33, 1.0, s19
+; SI-NEXT: v_mul_f32_e64 v51, 1.0, s22
+; SI-NEXT: v_mul_f32_e64 v48, 1.0, s23
+; SI-NEXT: v_mul_f32_e64 v54, 1.0, s26
+; SI-NEXT: v_mul_f32_e64 v52, 1.0, s27
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill
 ; SI-NEXT: s_cbranch_scc0 .LBB105_2
 ; SI-NEXT: ; %bb.1: ; %cmp.false
+; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v30
+; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
 ; SI-NEXT: s_waitcnt expcnt(1)
+; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v32
+; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v26
+; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v24
+; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19
+; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(1)
+; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v22
 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v20
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v54
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v26
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v52
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v21
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; SI-NEXT: s_mov_b64 s[4:5], 0
+; SI-NEXT: v_mov_b32_e32 v40, v49
+; SI-NEXT: v_mov_b32_e32 v44, v53
+; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v42
+; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(1)
+; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v35
+; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(1)
+; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v62
+; SI-NEXT: v_mov_b32_e32 v35, v21
+; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v41
+; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(1)
+; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v33
+; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v54
+; SI-NEXT: v_mov_b32_e32 v54, v5
+; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(3)
+; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v60
+; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(1)
+; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v47
+; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v61
+; SI-NEXT: v_mov_b32_e32 v47, v45
+; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v36
+; SI-NEXT: v_mov_b32_e32 v60, v42
+; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v29
+; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(3)
+; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v7
+; SI-NEXT: s_waitcnt expcnt(1)
+; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v51
+; SI-NEXT: v_mov_b32_e32 v19, v18
+; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18
+; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(1)
+; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(1)
+; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v48
+; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v21
+; SI-NEXT: v_mov_b32_e32 v30, v20
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v3
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload
+; SI-NEXT: v_mov_b32_e32 v51, v56
+; SI-NEXT: s_waitcnt vmcnt(8)
+; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v18
+; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v49
+; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v57
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(3)
+; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v45
+; SI-NEXT: v_mov_b32_e32 v45, v57
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(1)
+; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v56
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v25
+; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(1)
+; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v52
+; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(2)
+; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v20
+; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v11
+; SI-NEXT: v_mov_b32_e32 v52, v7
+; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v39
+; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v23
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(1)
+; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v59
+; SI-NEXT: v_mov_b32_e32 v24, v23
+; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v28
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(1)
+; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v6
+; SI-NEXT: v_mov_b32_e32 v11, v6
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(1)
+; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v3
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(1)
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v8
+; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v13
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(1)
+; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v16
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(1)
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v9
+; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v63
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(1)
+; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v58
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(1)
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v31
+; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v34
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt vmcnt(14) expcnt(1)
+; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v26
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(1)
+; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v17
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill
 ; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v14
+; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v55
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(1)
+; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v15
 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v4
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(1)
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v37
+; SI-NEXT: v_mov_b32_e32 v42, v4
+; SI-NEXT: v_mov_b32_e32 v15, v63
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(1)
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v46
+; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v32
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(1)
 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v50
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v45
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v15
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v14
+; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v10
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(1)
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v38
+; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v12
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(1)
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v27
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v13
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v16
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v19
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v24
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v27
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v11
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v62
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v35
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(1)
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v53
+; SI-NEXT: v_mov_b32_e32 v5, v18
+; SI-NEXT: v_mov_b32_e32 v6, v41
 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v25
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v36
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v39
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v51
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v48
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v49
-; SI-NEXT: v_mov_b32_e32 v25, v1
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v58
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v30
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
-; SI-NEXT: v_mov_b32_e32 v57, v13
-; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
 ; SI-NEXT: s_waitcnt expcnt(1)
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v34
-; SI-NEXT: v_mov_b32_e32 v40, v3
-; SI-NEXT: v_mov_b32_e32 v54, v50
-; SI-NEXT: v_mov_b32_e32 v46, v19
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
-; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v9
-; SI-NEXT: v_mov_b32_e32 v44, v15
-; SI-NEXT: v_mov_b32_e32 v9, v11
-; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
-; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v6
-; SI-NEXT: s_mov_b64 s[4:5], 0
-; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v59
-; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v10
-; SI-NEXT: v_mov_b32_e32 v41, v27
-; SI-NEXT: v_mov_b32_e32 v52, v62
-; SI-NEXT: v_mov_b32_e32 v21, v58
-; SI-NEXT: v_mov_b32_e32 v58, v20
-; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v22
-; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v29
-; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v28
-; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v33
-; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v32
-; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v38
-; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v8
-; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v12
-; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v31
-; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18
-; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v23
-; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v7
-; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v53
-; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v60
-; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v37
-; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v56
-; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v5
-; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v55
-; SI-NEXT: v_mov_b32_e32 v55, v4
-; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v42
-; SI-NEXT: v_mov_b32_e32 v56, v47
-; SI-NEXT: v_mov_b32_e32 v53, v5
-; SI-NEXT: v_mov_b32_e32 v42, v43
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(7) expcnt(1)
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v13
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(6) expcnt(1)
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v19
-; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v3
-; SI-NEXT: s_waitcnt vmcnt(5)
-; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v11
-; SI-NEXT: s_waitcnt vmcnt(4)
-; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v15
-; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v50
-; SI-NEXT: v_mov_b32_e32 v5, v19
-; SI-NEXT: v_mov_b32_e32 v7, v15
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
 ; SI-NEXT: s_waitcnt expcnt(1)
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v17
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v43
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
 ; SI-NEXT: s_waitcnt expcnt(1)
+; SI-NEXT: v_mov_b32_e32 v2, v38
+; SI-NEXT: v_mov_b32_e32 v38, v31
+; SI-NEXT: s_branch .LBB105_3
+; SI-NEXT: .LBB105_2:
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: s_waitcnt expcnt(4)
+; SI-NEXT: v_mov_b32_e32 v35, v21
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v54, v5
+; SI-NEXT: v_mov_b32_e32 v30, v20
+; SI-NEXT: v_mov_b32_e32 v52, v7
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill
+; SI-NEXT: ; implicit-def: $vgpr2
 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(1)
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v63
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(1)
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v61
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill
+; SI-NEXT: ; implicit-def: $vgpr2
 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(1)
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v47
-; SI-NEXT: v_mov_b32_e32 v47, v3
-; SI-NEXT: v_mov_b32_e32 v3, v17
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill
+; SI-NEXT: ; implicit-def: $vgpr33
+; SI-NEXT: ; implicit-def: $vgpr2
 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v43
-; SI-NEXT: v_mov_b32_e32 v1, v13
-; SI-NEXT: s_branch .LBB105_3
-; SI-NEXT: .LBB105_2:
-; SI-NEXT: s_waitcnt expcnt(5)
-; SI-NEXT: v_mov_b32_e32 v25, v1
 ; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: s_waitcnt expcnt(3)
-; SI-NEXT: v_mov_b32_e32 v21, v58
-; SI-NEXT: s_waitcnt expcnt(2)
-; SI-NEXT: v_mov_b32_e32 v52, v62
-; SI-NEXT: ; kill: killed $vgpr2
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
-; SI-NEXT: ; implicit-def: $vgpr39
-; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: ; kill: killed $vgpr1
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
 ; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; kill: killed $vgpr2
-; SI-NEXT: ; implicit-def: $vgpr62
-; SI-NEXT: ; implicit-def: $vgpr35
-; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: ; kill: killed $vgpr1
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill
 ; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; kill: killed $vgpr2
-; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
-; SI-NEXT: ; implicit-def: $vgpr28
-; SI-NEXT: ; implicit-def: $vgpr58
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
 ; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; kill: killed $vgpr2
-; SI-NEXT: ; implicit-def: $vgpr51
-; SI-NEXT: ; implicit-def: $vgpr26
-; SI-NEXT: ; implicit-def: $vgpr59
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
 ; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; kill: killed $vgpr2
-; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
-; SI-NEXT: ; implicit-def: $vgpr24
-; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: ; kill: killed $vgpr1
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
 ; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; kill: killed $vgpr2
-; SI-NEXT: ; implicit-def: $vgpr36
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
+; SI-NEXT: ; implicit-def: $vgpr25
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
+; SI-NEXT: ; implicit-def: $vgpr21
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
 ; SI-NEXT: ; implicit-def: $vgpr20
-; SI-NEXT: ; implicit-def: $vgpr55
 ; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; kill: killed $vgpr2
-; SI-NEXT: ; implicit-def: $vgpr33
-; SI-NEXT: ; implicit-def: $vgpr16
-; SI-NEXT: ; implicit-def: $vgpr45
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
 ; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; kill: killed $vgpr2
-; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: ; kill: killed $vgpr1
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill
 ; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; kill: killed $vgpr2
-; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: ; kill: killed $vgpr1
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill
+; SI-NEXT: ; implicit-def: $vgpr62
 ; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; kill: killed $vgpr2
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
 ; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; kill: killed $vgpr2
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill
 ; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; kill: killed $vgpr2
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
 ; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; kill: killed $vgpr2
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill
 ; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; kill: killed $vgpr2
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
 ; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; kill: killed $vgpr2
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill
 ; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; kill: killed $vgpr2
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
+; SI-NEXT: ; implicit-def: $vgpr16
 ; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; kill: killed $vgpr2
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
 ; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; kill: killed $vgpr2
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill
 ; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; kill: killed $vgpr2
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
+; SI-NEXT: ; implicit-def: $vgpr59
 ; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; kill: killed $vgpr2
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
 ; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; kill: killed $vgpr2
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill
 ; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; kill: killed $vgpr2
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
+; SI-NEXT: ; implicit-def: $vgpr58
 ; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; kill: killed $vgpr2
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
+; SI-NEXT: ; implicit-def: $vgpr61
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
 ; SI-NEXT: ; implicit-def: $vgpr48
-; SI-NEXT: ; implicit-def: $vgpr22
-; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
-; SI-NEXT: ; implicit-def: $vgpr18
-; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
+; SI-NEXT: ; implicit-def: $vgpr7
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
+; SI-NEXT: ; implicit-def: $vgpr29
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
 ; SI-NEXT: ; implicit-def: $vgpr14
-; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
-; SI-NEXT: ; implicit-def: $vgpr12
-; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
-; SI-NEXT: ; implicit-def: $vgpr10
-; SI-NEXT: ; implicit-def: $vgpr49
-; SI-NEXT: ; implicit-def: $vgpr8
-; SI-NEXT: ; implicit-def: $vgpr37
-; SI-NEXT: ; implicit-def: $vgpr6
-; SI-NEXT: ; implicit-def: $vgpr31
-; SI-NEXT: ; implicit-def: $vgpr4
-; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
-; SI-NEXT: v_mov_b32_e32 v54, v50
-; SI-NEXT: v_mov_b32_e32 v9, v11
-; SI-NEXT: v_mov_b32_e32 v53, v5
-; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt expcnt(1)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
-; SI-NEXT: v_mov_b32_e32 v56, v47
-; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
-; SI-NEXT: v_mov_b32_e32 v40, v3
-; SI-NEXT: v_mov_b32_e32 v44, v15
-; SI-NEXT: v_mov_b32_e32 v57, v13
-; SI-NEXT: v_mov_b32_e32 v46, v19
-; SI-NEXT: v_mov_b32_e32 v41, v27
-; SI-NEXT: s_mov_b64 s[4:5], -1
-; SI-NEXT: v_mov_b32_e32 v42, v43
-; SI-NEXT: v_mov_b32_e32 v3, v17
 ; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
+; SI-NEXT: ; implicit-def: $vgpr22
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
+; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload
+; SI-NEXT: v_mov_b32_e32 v24, v23
+; SI-NEXT: v_mov_b32_e32 v19, v18
+; SI-NEXT: v_mov_b32_e32 v51, v56
+; SI-NEXT: v_mov_b32_e32 v40, v49
+; SI-NEXT: v_mov_b32_e32 v47, v45
+; SI-NEXT: v_mov_b32_e32 v44, v53
+; SI-NEXT: v_mov_b32_e32 v11, v6
+; SI-NEXT: s_mov_b64 s[4:5], -1
+; SI-NEXT: s_waitcnt expcnt(1)
+; SI-NEXT: v_mov_b32_e32 v2, v38
+; SI-NEXT: v_mov_b32_e32 v60, v42
+; SI-NEXT: v_mov_b32_e32 v45, v57
+; SI-NEXT: v_mov_b32_e32 v38, v31
+; SI-NEXT: v_mov_b32_e32 v42, v4
+; SI-NEXT: v_mov_b32_e32 v6, v41
+; SI-NEXT: v_mov_b32_e32 v15, v63
+; SI-NEXT: ; implicit-def: $vgpr8
 ; SI-NEXT: .LBB105_3: ; %Flow
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload
+; SI-NEXT: v_mov_b32_e32 v18, v3
+; SI-NEXT: v_mov_b32_e32 v57, v9
+; SI-NEXT: v_mov_b32_e32 v23, v1
 ; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5]
+; SI-NEXT: v_mov_b32_e32 v53, v47
+; SI-NEXT: v_mov_b32_e32 v47, v51
 ; SI-NEXT: s_cbranch_vccnz .LBB105_5
 ; SI-NEXT: ; %bb.4: ; %cmp.true
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload
-; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v30
-; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v4
-; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v31
-; SI-NEXT: v_mov_b32_e32 v38, v9
-; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v38
-; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
-; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14
-; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
-; SI-NEXT: v_and_b32_e32 v38, 0xffff0000, v31
-; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(9)
-; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_mov_b32_e32 v4, v52
+; SI-NEXT: v_mov_b32_e32 v52, v2
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v13
 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT: v_alignbit_b32 v2, v4, v2, 16
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload
-; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v29
-; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v4
-; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v30
-; SI-NEXT: s_waitcnt vmcnt(9)
-; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v43
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT: v_mov_b32_e32 v59, v45
+; SI-NEXT: v_mov_b32_e32 v61, v46
+; SI-NEXT: v_mov_b32_e32 v46, v2
+; SI-NEXT: v_mov_b32_e32 v45, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v55
+; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
+; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v2
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v39
+; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v23
+; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v2
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v36
+; SI-NEXT: v_add_f32_e32 v48, 0x40c00000, v1
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v50
+; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
+; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v1
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v37
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT: v_mov_b32_e32 v37, v42
+; SI-NEXT: v_mov_b32_e32 v42, v2
+; SI-NEXT: v_mov_b32_e32 v41, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v34
+; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
+; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v2
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v28
+; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
+; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v2
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v15
+; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
+; SI-NEXT: v_mov_b32_e32 v31, v30
+; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v2
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v17
+; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
+; SI-NEXT: v_mov_b32_e32 v15, v44
+; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v2
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v6
+; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
+; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v2
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v57
+; SI-NEXT: v_add_f32_e32 v62, 0x40c00000, v1
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v27
+; SI-NEXT: v_add_f32_e32 v57, 0x40c00000, v1
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v32
+; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v1
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v26
+; SI-NEXT: v_mov_b32_e32 v10, v11
+; SI-NEXT: v_mov_b32_e32 v11, v24
+; SI-NEXT: v_add_f32_e32 v43, 0x40c00000, v1
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v18
+; SI-NEXT: v_add_f32_e32 v55, 0x40c00000, v1
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v11
+; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v1
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v35
+; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v1
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v54
+; SI-NEXT: v_mov_b32_e32 v12, v19
+; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v59
 ; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
-; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; SI-NEXT: s_waitcnt vmcnt(8)
-; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
-; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
-; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(9)
-; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
-; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10
-; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10
-; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(9)
-; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
-; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12
-; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12
-; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(9)
-; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
-; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
-; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18
-; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(9)
-; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
-; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20
-; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20
-; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(9)
-; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
-; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28
-; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v28
-; SI-NEXT: s_waitcnt vmcnt(6)
+; SI-NEXT: v_mov_b32_e32 v59, v58
+; SI-NEXT: s_waitcnt vmcnt(0)
 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT: v_alignbit_b32 v2, v4, v2, 16
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload
-; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v34
-; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v4
-; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v29
-; SI-NEXT: v_and_b32_e32 v34, 0xffff0000, v30
-; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v2
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v5
+; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
+; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v2
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v31
+; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
+; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v1
+; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v2
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload
 ; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; SI-NEXT: s_waitcnt vmcnt(0)
 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT: v_alignbit_b32 v2, v4, v2, 16
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload
-; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v32
-; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v4
-; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v27
-; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v1
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v2
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; SI-NEXT: v_add_f32_e32 v35, 0x40c00000, v1
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1)
 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT: v_alignbit_b32 v2, v4, v2, 16
-; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v1
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v40
-; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v4
+; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v2
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; SI-NEXT: v_add_f32_e32 v50, 0x40c00000, v1
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v26
-; SI-NEXT: v_alignbit_b32 v2, v4, v2, 16
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v11
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v54
+; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v2
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v1
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
+; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v2
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; SI-NEXT: v_add_f32_e32 v33, 0x40c00000, v1
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
 ; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v2
-; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v19
-; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload
-; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v7
-; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v2
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v15
-; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
-; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v19
+; SI-NEXT: v_lshr_b64 v[1:2], v[33:34], 16
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(1)
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1)
 ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16
-; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v50
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v44
-; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v2
-; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v16
-; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16
-; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v5
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v57
-; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v2
-; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v17
-; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16
-; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v2
+; SI-NEXT: v_add_f32_e32 v33, 0x40c00000, v1
+; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v24
+; SI-NEXT: v_lshr_b64 v[1:2], v[33:34], 16
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(1)
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v46
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
 ; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v2
-; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v13
-; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16
-; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v63
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
+; SI-NEXT: v_add_f32_e32 v33, 0x40c00000, v1
+; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v13
+; SI-NEXT: v_lshr_b64 v[1:2], v[33:34], 16
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(1)
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v41
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v2
+; SI-NEXT: v_add_f32_e32 v33, 0x40c00000, v1
+; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v14
+; SI-NEXT: v_lshr_b64 v[1:2], v[33:34], 16
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(1)
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v12
 ; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v2
-; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v11
-; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16
-; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v61
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v11
+; SI-NEXT: v_and_b32_e32 v54, 0xffff0000, v14
+; SI-NEXT: v_mov_b32_e32 v14, v26
+; SI-NEXT: v_lshr_b64 v[25:26], v[25:26], 16
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; SI-NEXT: v_add_f32_e32 v33, 0x40c00000, v1
+; SI-NEXT: v_lshr_b64 v[1:2], v[33:34], 16
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(1)
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v52
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v4
+; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v2
+; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v12
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v40
+; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
+; SI-NEXT: v_and_b32_e32 v39, 0xffff0000, v4
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; SI-NEXT: v_add_f32_e32 v33, 0x40c00000, v1
+; SI-NEXT: v_lshr_b64 v[1:2], v[33:34], 16
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v47
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v53
 ; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v2
-; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v9
-; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; SI-NEXT: v_add_f32_e32 v33, 0x40c00000, v1
+; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v9
+; SI-NEXT: v_lshr_b64 v[1:2], v[33:34], 16
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(1)
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload
-; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v60
-; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v2
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v7
-; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
-; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
-; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
-; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
-; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v10
+; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v2
+; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v10
+; SI-NEXT: v_mov_b32_e32 v47, v46
+; SI-NEXT: v_mov_b32_e32 v46, v45
+; SI-NEXT: v_and_b32_e32 v45, 0xffff0000, v12
 ; SI-NEXT: s_waitcnt vmcnt(0)
 ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v25
-; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload
-; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
-; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24
-; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v24
-; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(3)
-; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v2
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v5
-; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
+; SI-NEXT: v_add_f32_e32 v33, 0x40c00000, v1
+; SI-NEXT: v_lshr_b64 v[1:2], v[33:34], 16
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(1)
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload
-; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload
 ; SI-NEXT: s_waitcnt vmcnt(1)
 ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
 ; SI-NEXT: s_waitcnt vmcnt(0)
 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v2
-; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v3
-; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v2
+; SI-NEXT: v_add_f32_e32 v33, 0x40c00000, v1
+; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v7
+; SI-NEXT: v_lshr_b64 v[1:2], v[33:34], 16
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(1)
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload
-; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v56
-; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v2
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v4
-; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1)
 ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16
-; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v21
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v42
-; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v2
+; SI-NEXT: v_add_f32_e32 v33, 0x40c00000, v1
+; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v8
+; SI-NEXT: v_lshr_b64 v[1:2], v[33:34], 16
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(1)
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v2, v2, v1, 16
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload
 ; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
-; SI-NEXT: v_alignbit_b32 v4, v6, v4, 16
-; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v53
-; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
-; SI-NEXT: v_alignbit_b32 v6, v8, v6, 16
-; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
-; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21
-; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21
-; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
-; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
-; SI-NEXT: v_alignbit_b32 v8, v10, v8, 16
-; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
-; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10
-; SI-NEXT: v_alignbit_b32 v10, v12, v10, 16
-; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
 ; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
-; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12
-; SI-NEXT: v_alignbit_b32 v12, v14, v12, 16
-; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v23
-; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
-; SI-NEXT: v_alignbit_b32 v14, v18, v14, 16
-; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
-; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
-; SI-NEXT: v_alignbit_b32 v18, v20, v18, 16
-; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
-; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23
-; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v23
-; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v47
-; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23
-; SI-NEXT: v_alignbit_b32 v24, v24, v23, 16
-; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v26
-; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload
-; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
-; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v2
+; SI-NEXT: v_add_f32_e32 v33, 0x40c00000, v1
+; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v5
+; SI-NEXT: v_lshr_b64 v[1:2], v[33:34], 16
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(1)
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload
 ; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
-; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20
-; SI-NEXT: v_alignbit_b32 v22, v21, v20, 16
-; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v16
-; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload
-; SI-NEXT: v_lshr_b64 v[48:49], v[21:22], 16
-; SI-NEXT: v_lshr_b64 v[49:50], v[7:8], 16
-; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
-; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26
-; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v26
-; SI-NEXT: v_alignbit_b32 v26, v59, v25, 16
-; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v27
-; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload
-; SI-NEXT: v_lshr_b64 v[51:52], v[25:26], 16
-; SI-NEXT: v_lshr_b64 v[52:53], v[1:2], 16
-; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(4)
-; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
-; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20
-; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v20
-; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(4)
-; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
-; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
-; SI-NEXT: v_alignbit_b32 v16, v45, v16, 16
-; SI-NEXT: s_waitcnt vmcnt(3)
-; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
-; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27
-; SI-NEXT: v_alignbit_b32 v28, v58, v27, 16
-; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v29
-; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
-; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20
-; SI-NEXT: v_alignbit_b32 v20, v55, v20, 16
-; SI-NEXT: v_lshr_b64 v[36:37], v[19:20], 16
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
-; SI-NEXT: v_add_f32_e32 v32, 0x40c00000, v29
-; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
 ; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
-; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29
-; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v29
-; SI-NEXT: v_alignbit_b32 v35, v43, v32, 16
-; SI-NEXT: v_add_f32_e32 v32, 0x40c00000, v30
-; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload
-; SI-NEXT: v_lshr_b64 v[62:63], v[34:35], 16
-; SI-NEXT: v_lshr_b64 v[33:34], v[15:16], 16
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
-; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30
-; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v30
-; SI-NEXT: v_alignbit_b32 v39, v29, v32, 16
-; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
-; SI-NEXT: v_lshr_b64 v[31:32], v[38:39], 16
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_add_f32_e32 v33, 0x40c00000, v1
+; SI-NEXT: v_lshr_b64 v[1:2], v[33:34], 16
 ; SI-NEXT: v_lshr_b64 v[37:38], v[5:6], 16
-; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[31:32], v[27:28], 16 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[31:32], v[23:24], 16 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[31:32], v[17:18], 16 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[31:32], v[13:14], 16 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v33, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v3 +; SI-NEXT: v_lshr_b64 v[1:2], v[33:34], 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[31:32], v[11:12], 16 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v37 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v38 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v33, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v2 +; SI-NEXT: v_lshr_b64 v[33:34], v[33:34], 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v61 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_add_f32_e32 v33, 0x40c00000, v1 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[31:32], v[9:10], 16 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v4 +; SI-NEXT: v_lshr_b64 v[33:34], v[33:34], 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v52 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_add_f32_e32 v33, 0x40c00000, v1 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v6 +; SI-NEXT: v_lshr_b64 v[33:34], v[33:34], 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v60 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v15 +; SI-NEXT: v_add_f32_e32 v37, 0x40c00000, v33 +; SI-NEXT: v_add_f32_e32 v33, 0x40c00000, v1 +; SI-NEXT: s_waitcnt expcnt(0) 
+; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v37 +; SI-NEXT: v_lshr_b64 v[33:34], v[33:34], 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[31:32], v[3:4], 16 +; SI-NEXT: v_lshr_b64 v[33:34], v[46:47], 16 +; SI-NEXT: v_mov_b32_e32 v61, v42 +; SI-NEXT: v_mov_b32_e32 v1, v33 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v53, v33 +; SI-NEXT: v_lshr_b64 v[33:34], v[48:49], 16 +; SI-NEXT: v_mov_b32_e32 v60, v41 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v1, v33 +; SI-NEXT: v_mov_b32_e32 v38, v33 +; SI-NEXT: v_lshr_b64 v[33:34], v[22:23], 16 +; SI-NEXT: v_and_b32_e32 v52, 0xffff0000, v37 +; SI-NEXT: v_mov_b32_e32 v40, v33 +; SI-NEXT: v_and_b32_e32 v37, 0xffff0000, v6 +; SI-NEXT: v_and_b32_e32 v41, 0xffff0000, v8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v1, v33 +; SI-NEXT: v_lshr_b64 v[33:34], v[60:61], 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, v33 +; SI-NEXT: v_lshr_b64 v[33:34], v[62:63], 16 +; SI-NEXT: v_mov_b32_e32 v62, v56 +; SI-NEXT: v_mov_b32_e32 v4, v33 +; SI-NEXT: v_lshr_b64 v[33:34], v[57:58], 16 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v6, v33 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[33:34], v[29:30], 16 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v8, v33 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v42, v33 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v8, v44 +; SI-NEXT: v_lshr_b64 v[33:34], v[43:44], 16 +; SI-NEXT: v_and_b32_e32 v43, 0xffff0000, v10 +; SI-NEXT: v_mov_b32_e32 v58, v63 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v8, v33 +; SI-NEXT: v_lshr_b64 v[33:34], v[55:56], 16 +; SI-NEXT: v_mov_b32_e32 v55, v25 +; SI-NEXT: v_mov_b32_e32 v10, v33 +; SI-NEXT: v_mov_b32_e32 v44, v33 +; SI-NEXT: v_lshr_b64 v[33:34], v[27:28], 16 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:536 ; 
4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v10, v33 +; SI-NEXT: v_lshr_b64 v[33:34], v[20:21], 16 +; SI-NEXT: v_and_b32_e32 v56, 0xffff0000, v24 +; SI-NEXT: v_mov_b32_e32 v12, v33 +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v46, v33 +; SI-NEXT: v_lshr_b64 v[33:34], v[31:32], 16 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_mov_b32_e32 v12, v33 +; SI-NEXT: v_lshr_b64 v[33:34], v[35:36], 16 +; SI-NEXT: v_and_b32_e32 v34, 0xffff0000, v19 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[5:6], v[5:6], 16 +; SI-NEXT: v_lshr_b64 v[3:4], v[3:4], 16 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshr_b64 v[7:8], v[7:8], 16 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[7:8], v[41:42], 16 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v7, v23 +; SI-NEXT: v_lshr_b64 v[22:23], v[52:53], 16 +; SI-NEXT: v_mov_b32_e32 v20, v28 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v14, v25 +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v14, v36 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v15, v51 +; SI-NEXT: v_lshr_b64 v[24:25], v[50:51], 16 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; SI-NEXT: v_mov_b32_e32 v57, v24 +; SI-NEXT: v_lshr_b64 v[11:12], v[11:12], 16 +; SI-NEXT: v_lshr_b64 v[9:10], v[9:10], 16 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshr_b64 v[11:12], v[45:46], 16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[9:10], v[43:44], 16 +; SI-NEXT: v_mov_b32_e32 v8, v47 +; SI-NEXT: v_mov_b32_e32 v25, v32 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:460 ; 
4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v14, v33 +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v15, v24 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v15, v17 +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshr_b64 v[16:17], v[16:17], 16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v15, v16 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[13:14], v[13:14], 16 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v14, v49 +; SI-NEXT: v_lshr_b64 v[48:49], v[39:40], 16 +; SI-NEXT: v_mov_b32_e32 v35, v16 +; SI-NEXT: v_lshr_b64 v[15:16], v[34:35], 16 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[15:16], v[56:57], 16 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[15:16], v[54:55], 16 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v16, v30 +; SI-NEXT: v_lshr_b64 v[29:30], v[37:38], 16 ; SI-NEXT: .LBB105_5: ; %end -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword 
v2, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v52 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v39 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 4, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v62 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v35 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v43 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v29 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; 
SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v28 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v58 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v51 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v33 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v26 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v59 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v25 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v24 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, 
s[0:3], s32 offset:392 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v36 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v25 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v20 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v55 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v33 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v21 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: 
v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v45 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v48 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v20 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v22 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v62 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v15 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: 
buffer_load_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v15 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 
offset:252 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v13 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v59 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v11 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v58 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 +; SI-NEXT: buffer_store_dword v1, v2, 
s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v49 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v61 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v37 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v48 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x6c, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v7 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v31 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: 
v_lshlrev_b32_e32 v2, 16, v29 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x70, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x74, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v14 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v22 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x78, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v8 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload @@ -234945,19 +236256,19 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_writelane_b32 v42, s31, 1 ; VI-NEXT: v_mov_b32_e32 v31, v17 ; VI-NEXT: v_mov_b32_e32 v30, v16 -; VI-NEXT: v_mov_b32_e32 v29, v15 +; VI-NEXT: v_mov_b32_e32 v32, v15 +; VI-NEXT: v_mov_b32_e32 v33, v13 +; VI-NEXT: v_mov_b32_e32 v34, v11 +; VI-NEXT: v_mov_b32_e32 v35, v9 +; VI-NEXT: v_mov_b32_e32 v36, v7 +; VI-NEXT: v_mov_b32_e32 v37, v5 +; VI-NEXT: v_mov_b32_e32 v38, v3 ; VI-NEXT: v_mov_b32_e32 v28, v14 -; VI-NEXT: v_mov_b32_e32 v27, v13 ; VI-NEXT: v_mov_b32_e32 v26, v12 -; VI-NEXT: v_mov_b32_e32 v25, v11 ; VI-NEXT: v_mov_b32_e32 v24, v10 -; VI-NEXT: v_mov_b32_e32 v23, v9 ; VI-NEXT: v_mov_b32_e32 v22, v8 -; VI-NEXT: v_mov_b32_e32 v21, v7 ; VI-NEXT: v_mov_b32_e32 v20, v6 -; VI-NEXT: 
v_mov_b32_e32 v19, v5 -; VI-NEXT: v_mov_b32_e32 v32, v4 -; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v48, v4 ; VI-NEXT: v_mov_b32_e32 v16, v2 ; VI-NEXT: v_readfirstlane_b32 s30, v0 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec @@ -234968,583 +236279,595 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_cbranch_execnz .LBB105_4 ; VI-NEXT: .LBB105_2: ; %cmp.true -; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 -; VI-NEXT: s_lshl_b32 s4, s30, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: s_and_b32 s4, s28, 0xffff0000 +; VI-NEXT: v_mov_b32_e32 v17, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v0, s4, v17 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: s_lshl_b32 s4, s28, 16 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_add_f32_e32 v0, s4, v17 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; VI-NEXT: s_and_b32 s4, s30, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v17 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: s_lshl_b32 s4, s30, 16 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: s_lshl_b32 s4, s31, 16 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_add_f32_e32 v3, s4, v0 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 +; VI-NEXT: v_add_f32_e32 v2, s4, v17 +; VI-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc +; VI-NEXT: v_bfe_u32 v4, v2, 16, 1 +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v2 +; VI-NEXT: s_and_b32 s4, s26, 0xffff0000 ; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: s_and_b32 s4, s31, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: v_add_f32_e32 v4, s4, v0 -; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s29, 16 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_alignbit_b32 v14, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_alignbit_b32 v15, v4, v3, 16 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s29, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: 
v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s28, 16 -; VI-NEXT: v_alignbit_b32 v13, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s28, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_lshrrev_b64 v[12:13], 16, v[0:1] +; VI-NEXT: v_add_f32_e32 v0, s4, v17 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s27, 16 -; VI-NEXT: v_alignbit_b32 v12, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s27, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 ; VI-NEXT: s_lshl_b32 s4, s26, 16 -; VI-NEXT: v_alignbit_b32 v11, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s26, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s25, 16 -; VI-NEXT: v_alignbit_b32 v10, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s25, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s24, 16 -; VI-NEXT: v_alignbit_b32 v9, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 
v2, v1 +; VI-NEXT: v_lshrrev_b64 v[14:15], 16, v[2:3] +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_add_f32_e32 v0, s4, v17 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; VI-NEXT: s_and_b32 s4, s24, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s23, 16 -; VI-NEXT: v_alignbit_b32 v8, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s22, 16 -; VI-NEXT: v_alignbit_b32 v7, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_lshrrev_b64 v[10:11], 16, v[0:1] +; VI-NEXT: v_add_f32_e32 v0, s4, v17 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: s_lshl_b32 s4, s24, 16 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_add_f32_e32 v0, s4, v17 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; VI-NEXT: s_and_b32 s4, s22, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s21, 16 -; VI-NEXT: v_alignbit_b32 v6, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_lshrrev_b64 v[8:9], 16, v[0:1] +; VI-NEXT: v_add_f32_e32 v0, s4, v17 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: 
s_lshl_b32 s4, s22, 16 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_add_f32_e32 v0, s4, v17 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 +; VI-NEXT: v_lshrrev_b64 v[6:7], 16, v[0:1] +; VI-NEXT: v_add_f32_e32 v0, s4, v17 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 ; VI-NEXT: s_lshl_b32 s4, s20, 16 -; VI-NEXT: v_alignbit_b32 v5, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_add_f32_e32 v0, s4, v17 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s19, 16 -; VI-NEXT: v_alignbit_b32 v4, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: v_lshrrev_b64 v[4:5], 16, v[0:1] +; VI-NEXT: v_add_f32_e32 v0, s4, v17 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: s_lshl_b32 s4, s18, 16 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_add_f32_e32 v0, s4, v17 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: v_lshrrev_b64 v[2:3], 16, v[0:1] +; VI-NEXT: v_add_f32_e32 v0, s4, v17 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, 
v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: s_lshl_b32 s4, s16, 16 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_add_f32_e32 v0, s4, v17 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; VI-NEXT: v_bfe_u32 v3, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: v_add_f32_e32 v1, s4, v17 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_lshl_b32 s4, s17, 16 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_add_f32_e32 v3, s4, v17 +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 ; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v1 +; VI-NEXT: v_add_f32_e32 v1, s4, v17 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v18, v5, v7, vcc +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v18, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v18, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s18, 16 -; VI-NEXT: v_alignbit_b32 v3, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v18, 0x400000, v1 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v18, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v18, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v2 -; VI-NEXT: v_add_u32_e32 v18, vcc, 0x7fff, v18 -; VI-NEXT: v_or_b32_e32 v33, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v18, v33, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s17, 16 -; VI-NEXT: v_alignbit_b32 v2, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v18, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v1 +; VI-NEXT: s_lshl_b32 s4, s19, 16 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_add_f32_e32 v3, s4, v17 +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v49, v5, v7, vcc +; VI-NEXT: v_add_f32_e32 v5, s4, v17 +; VI-NEXT: v_bfe_u32 v7, v5, 16, 1 +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v5 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: s_lshl_b32 s4, s21, 16 +; VI-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc +; 
VI-NEXT: v_add_f32_e32 v7, s4, v17 +; VI-NEXT: v_bfe_u32 v9, v7, 16, 1 +; VI-NEXT: v_lshrrev_b64 v[18:19], 16, v[18:19] +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v7 +; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v5 +; VI-NEXT: v_add_f32_e32 v5, s4, v17 +; VI-NEXT: v_or_b32_e32 v11, 0x400000, v7 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; VI-NEXT: v_bfe_u32 v7, v5, 16, 1 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v1 +; VI-NEXT: v_mov_b32_e32 v1, v18 +; VI-NEXT: v_cndmask_b32_e32 v18, v9, v11, vcc +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v5 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: s_lshl_b32 s4, s23, 16 +; VI-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc +; VI-NEXT: v_add_f32_e32 v7, s4, v17 +; VI-NEXT: v_bfe_u32 v9, v7, 16, 1 +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v7 +; VI-NEXT: v_lshrrev_b64 v[49:50], 16, v[49:50] +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; VI-NEXT: v_or_b32_e32 v11, 0x400000, v7 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; VI-NEXT: s_and_b32 s4, s25, 0xffff0000 +; VI-NEXT: v_mov_b32_e32 v3, v49 +; VI-NEXT: v_cndmask_b32_e32 v49, v9, v11, vcc +; VI-NEXT: v_add_f32_e32 v9, s4, v17 +; VI-NEXT: v_bfe_u32 v11, v9, 16, 1 +; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v9 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 +; VI-NEXT: v_or_b32_e32 v13, 0x400000, v9 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; VI-NEXT: s_lshl_b32 s4, s25, 16 +; VI-NEXT: v_cndmask_b32_e32 v9, v11, v13, vcc +; VI-NEXT: v_add_f32_e32 v11, s4, v17 +; VI-NEXT: v_bfe_u32 v13, v11, 16, 1 +; VI-NEXT: v_lshrrev_b64 v[18:19], 16, v[18:19] +; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v11 +; VI-NEXT: s_and_b32 s4, s27, 0xffff0000 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 +; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v9 +; VI-NEXT: v_add_f32_e32 v9, s4, v17 +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v11 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; VI-NEXT: v_bfe_u32 v11, v9, 16, 1 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v5 +; VI-NEXT: v_mov_b32_e32 v5, v18 +; VI-NEXT: v_cndmask_b32_e32 v18, v13, v15, vcc +; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v9 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 +; VI-NEXT: v_or_b32_e32 v13, 0x400000, v9 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; VI-NEXT: s_lshl_b32 s4, s27, 16 +; VI-NEXT: v_cndmask_b32_e32 v9, v11, v13, vcc +; VI-NEXT: v_add_f32_e32 v11, s4, v17 +; VI-NEXT: v_bfe_u32 v13, v11, 16, 1 +; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v11 +; VI-NEXT: v_lshrrev_b64 v[49:50], 16, v[49:50] +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v11 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; VI-NEXT: s_and_b32 s4, s29, 0xffff0000 +; VI-NEXT: v_mov_b32_e32 v7, v49 +; VI-NEXT: v_cndmask_b32_e32 v49, v13, v15, vcc +; VI-NEXT: v_add_f32_e32 v13, s4, v17 +; VI-NEXT: v_bfe_u32 v15, v13, 16, 1 +; VI-NEXT: v_lshrrev_b64 v[18:19], 16, v[18:19] +; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v13 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v9 +; VI-NEXT: v_mov_b32_e32 v9, v18 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v13 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; VI-NEXT: s_lshl_b32 s4, s29, 16 +; VI-NEXT: v_cndmask_b32_e32 v13, v15, v18, vcc +; VI-NEXT: v_add_f32_e32 v15, s4, v17 +; VI-NEXT: v_bfe_u32 v18, v15, 16, 1 +; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v15 ; VI-NEXT: v_add_u32_e32 v18, vcc, 0x7fff, v18 -; VI-NEXT: v_or_b32_e32 v33, 0x400000, v1 -; VI-NEXT: 
v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v18, v33, vcc -; VI-NEXT: v_add_f32_e32 v18, s4, v0 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 -; VI-NEXT: s_lshl_b32 s4, s16, 16 -; VI-NEXT: v_alignbit_b32 v1, v18, v1, 16 -; VI-NEXT: v_add_f32_e32 v18, s4, v0 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v0, s4, v0 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v0 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v0 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v0, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v16 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_or_b32_e32 v19, 0x400000, v15 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; VI-NEXT: s_and_b32 s4, s31, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v18, v18, v19, vcc +; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v13 +; VI-NEXT: v_add_f32_e32 v13, s4, v17 +; VI-NEXT: v_bfe_u32 v15, v13, 16, 1 +; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v13 +; VI-NEXT: v_lshrrev_b64 v[18:19], 16, v[18:19] +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 +; VI-NEXT: v_or_b32_e32 v19, 0x400000, v13 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; VI-NEXT: s_lshl_b32 s4, s31, 16 +; VI-NEXT: v_cndmask_b32_e32 v13, v15, v19, vcc +; VI-NEXT: v_add_f32_e32 v15, s4, v17 +; VI-NEXT: v_bfe_u32 v17, v15, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v15 +; VI-NEXT: v_lshrrev_b64 v[49:50], 16, v[49:50] +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 +; VI-NEXT: v_or_b32_e32 v19, 0x400000, v15 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; VI-NEXT: v_mov_b32_e32 v11, v49 +; VI-NEXT: v_cndmask_b32_e32 v49, v17, v19, vcc +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v16 ; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v16, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v16 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v16 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v13 +; VI-NEXT: v_mov_b32_e32 v13, v18 +; VI-NEXT: v_bfe_u32 v18, v16, 16, 1 +; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v16 +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x7fff, v18 +; VI-NEXT: v_or_b32_e32 v19, 0x400000, v16 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 -; VI-NEXT: v_cndmask_b32_e32 v16, v33, v34, vcc -; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v17 -; VI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 -; VI-NEXT: v_bfe_u32 v34, v33, 16, 1 -; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v33 -; VI-NEXT: v_add_u32_e32 v34, vcc, 0x7fff, v34 -; VI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 -; 
VI-NEXT: v_or_b32_e32 v35, 0x400000, v33 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 -; VI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; VI-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc -; VI-NEXT: v_bfe_u32 v34, v17, 16, 1 -; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v17 -; VI-NEXT: v_add_u32_e32 v34, vcc, 0x7fff, v34 -; VI-NEXT: v_or_b32_e32 v35, 0x400000, v17 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 -; VI-NEXT: v_cndmask_b32_e32 v17, v34, v35, vcc -; VI-NEXT: v_lshlrev_b32_e32 v34, 16, v32 -; VI-NEXT: v_add_f32_e32 v34, 0x40c00000, v34 -; VI-NEXT: v_bfe_u32 v35, v34, 16, 1 -; VI-NEXT: v_add_u32_e32 v35, vcc, v35, v34 -; VI-NEXT: v_add_u32_e32 v35, vcc, 0x7fff, v35 -; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; VI-NEXT: v_or_b32_e32 v36, 0x400000, v34 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v34, v34 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_cndmask_b32_e32 v34, v35, v36, vcc -; VI-NEXT: v_bfe_u32 v35, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v35, vcc, v35, v32 -; VI-NEXT: v_add_u32_e32 v35, vcc, 0x7fff, v35 -; VI-NEXT: v_or_b32_e32 v36, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 -; VI-NEXT: v_cndmask_b32_e32 v32, v35, v36, vcc -; VI-NEXT: v_lshlrev_b32_e32 v35, 16, v19 -; VI-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 -; VI-NEXT: v_bfe_u32 v36, v35, 16, 1 -; VI-NEXT: v_add_u32_e32 v36, vcc, v36, v35 -; VI-NEXT: v_add_u32_e32 v36, vcc, 0x7fff, v36 -; VI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 -; VI-NEXT: v_or_b32_e32 v37, 0x400000, v35 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v35, v35 +; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v17 +; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v18, v18, v19, vcc +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 +; VI-NEXT: v_or_b32_e32 v19, 0x400000, v16 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; VI-NEXT: v_cndmask_b32_e32 v16, v17, v19, vcc +; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v18 +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v38 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v19, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v19, vcc, v19, v18 +; VI-NEXT: v_add_u32_e32 v19, vcc, 0x7fff, v19 +; VI-NEXT: v_or_b32_e32 v21, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_cndmask_b32_e32 v18, v19, v21, vcc +; VI-NEXT: v_lshlrev_b32_e32 v19, 16, v38 ; VI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 -; VI-NEXT: v_cndmask_b32_e32 v35, v36, v37, vcc -; VI-NEXT: v_bfe_u32 v36, v19, 16, 1 -; VI-NEXT: v_add_u32_e32 v36, vcc, v36, v19 -; VI-NEXT: v_add_u32_e32 v36, vcc, 0x7fff, v36 -; VI-NEXT: v_or_b32_e32 v37, 0x400000, v19 +; VI-NEXT: v_bfe_u32 v21, v19, 16, 1 +; VI-NEXT: v_add_u32_e32 v21, vcc, v21, v19 +; VI-NEXT: v_add_u32_e32 v21, vcc, 0x7fff, v21 +; VI-NEXT: v_or_b32_e32 v23, 0x400000, v19 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 -; VI-NEXT: v_cndmask_b32_e32 v19, v36, v37, vcc -; VI-NEXT: v_lshlrev_b32_e32 v36, 16, v20 -; VI-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 -; VI-NEXT: v_bfe_u32 v37, v36, 16, 1 -; VI-NEXT: v_add_u32_e32 v37, vcc, v37, v36 -; VI-NEXT: v_add_u32_e32 v37, vcc, 0x7fff, v37 +; VI-NEXT: v_and_b32_e32 v19, 0xffff0000, v48 +; VI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 +; VI-NEXT: v_cndmask_b32_e32 v38, v21, v23, vcc +; VI-NEXT: v_bfe_u32 v21, v19, 16, 1 +; VI-NEXT: v_add_u32_e32 v21, vcc, v21, v19 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v18 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v48 +; VI-NEXT: v_add_u32_e32 v21, vcc, 0x7fff, v21 +; VI-NEXT: v_or_b32_e32 v23, 0x400000, v19 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 +; VI-NEXT: v_add_f32_e32 
v18, 0x40c00000, v18 +; VI-NEXT: v_cndmask_b32_e32 v19, v21, v23, vcc +; VI-NEXT: v_bfe_u32 v21, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v21, vcc, v21, v18 +; VI-NEXT: v_add_u32_e32 v21, vcc, 0x7fff, v21 +; VI-NEXT: v_or_b32_e32 v23, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_cndmask_b32_e32 v18, v21, v23, vcc +; VI-NEXT: v_and_b32_e32 v21, 0xffff0000, v37 +; VI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 +; VI-NEXT: v_bfe_u32 v23, v21, 16, 1 +; VI-NEXT: v_add_u32_e32 v23, vcc, v23, v21 +; VI-NEXT: v_add_u32_e32 v23, vcc, 0x7fff, v23 +; VI-NEXT: v_or_b32_e32 v25, 0x400000, v21 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 +; VI-NEXT: v_cndmask_b32_e32 v21, v23, v25, vcc +; VI-NEXT: v_lshlrev_b32_e32 v23, 16, v37 +; VI-NEXT: v_lshrrev_b64 v[49:50], 16, v[49:50] +; VI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 +; VI-NEXT: v_bfe_u32 v25, v23, 16, 1 +; VI-NEXT: v_mov_b32_e32 v15, v49 +; VI-NEXT: v_add_u32_e32 v25, vcc, v25, v23 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v21 +; VI-NEXT: v_lshlrev_b32_e32 v21, 16, v20 ; VI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 -; VI-NEXT: v_or_b32_e32 v38, 0x400000, v36 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v36, v36 +; VI-NEXT: v_add_u32_e32 v25, vcc, 0x7fff, v25 ; VI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 -; VI-NEXT: v_cndmask_b32_e32 v36, v37, v38, vcc -; VI-NEXT: v_bfe_u32 v37, v20, 16, 1 -; VI-NEXT: v_add_u32_e32 v37, vcc, v37, v20 -; VI-NEXT: v_add_u32_e32 v37, vcc, 0x7fff, v37 -; VI-NEXT: v_or_b32_e32 v38, 0x400000, v20 +; VI-NEXT: v_or_b32_e32 v27, 0x400000, v23 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 +; VI-NEXT: v_bfe_u32 v23, v20, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v48, v25, v27, vcc +; VI-NEXT: v_add_u32_e32 v23, vcc, v23, v20 +; VI-NEXT: v_add_u32_e32 v23, vcc, 0x7fff, v23 +; VI-NEXT: v_or_b32_e32 v25, 0x400000, v20 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 -; VI-NEXT: v_cndmask_b32_e32 v20, v37, v38, vcc -; VI-NEXT: v_lshlrev_b32_e32 v37, 16, v21 -; VI-NEXT: v_add_f32_e32 v37, 0x40c00000, v37 -; VI-NEXT: v_bfe_u32 v38, v37, 16, 1 -; VI-NEXT: v_add_u32_e32 v38, vcc, v38, v37 -; VI-NEXT: v_add_u32_e32 v38, vcc, 0x7fff, v38 -; VI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 -; VI-NEXT: v_or_b32_e32 v39, 0x400000, v37 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v37, v37 -; VI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 -; VI-NEXT: v_cndmask_b32_e32 v37, v38, v39, vcc -; VI-NEXT: v_bfe_u32 v38, v21, 16, 1 -; VI-NEXT: v_add_u32_e32 v38, vcc, v38, v21 -; VI-NEXT: v_add_u32_e32 v38, vcc, 0x7fff, v38 -; VI-NEXT: v_or_b32_e32 v39, 0x400000, v21 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 -; VI-NEXT: v_cndmask_b32_e32 v21, v38, v39, vcc -; VI-NEXT: v_lshlrev_b32_e32 v38, 16, v22 -; VI-NEXT: v_add_f32_e32 v38, 0x40c00000, v38 -; VI-NEXT: v_bfe_u32 v39, v38, 16, 1 -; VI-NEXT: v_add_u32_e32 v39, vcc, v39, v38 -; VI-NEXT: v_add_u32_e32 v39, vcc, 0x7fff, v39 +; VI-NEXT: v_add_f32_e32 v20, 0x40c00000, v21 +; VI-NEXT: v_bfe_u32 v21, v20, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v23, v23, v25, vcc +; VI-NEXT: v_add_u32_e32 v21, vcc, v21, v20 +; VI-NEXT: v_add_u32_e32 v21, vcc, 0x7fff, v21 +; VI-NEXT: v_or_b32_e32 v25, 0x400000, v20 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 +; VI-NEXT: v_cndmask_b32_e32 v20, v21, v25, vcc +; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v23 +; VI-NEXT: v_and_b32_e32 v23, 0xffff0000, v36 +; VI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 +; VI-NEXT: v_bfe_u32 v25, v23, 16, 1 +; VI-NEXT: v_add_u32_e32 v25, vcc, v25, v23 +; VI-NEXT: v_add_u32_e32 v25, vcc, 0x7fff, v25 +; VI-NEXT: v_or_b32_e32 v27, 0x400000, v23 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 +; VI-NEXT: 
v_cndmask_b32_e32 v23, v25, v27, vcc +; VI-NEXT: v_lshlrev_b32_e32 v25, 16, v36 +; VI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 +; VI-NEXT: v_bfe_u32 v27, v25, 16, 1 +; VI-NEXT: v_add_u32_e32 v27, vcc, v27, v25 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v23 +; VI-NEXT: v_lshlrev_b32_e32 v23, 16, v22 ; VI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; VI-NEXT: v_or_b32_e32 v48, 0x400000, v38 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v38, v38 +; VI-NEXT: v_add_u32_e32 v27, vcc, 0x7fff, v27 ; VI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 -; VI-NEXT: v_cndmask_b32_e32 v38, v39, v48, vcc -; VI-NEXT: v_bfe_u32 v39, v22, 16, 1 -; VI-NEXT: v_add_u32_e32 v39, vcc, v39, v22 -; VI-NEXT: v_add_u32_e32 v39, vcc, 0x7fff, v39 -; VI-NEXT: v_or_b32_e32 v48, 0x400000, v22 +; VI-NEXT: v_or_b32_e32 v29, 0x400000, v25 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 +; VI-NEXT: v_bfe_u32 v25, v22, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v36, v27, v29, vcc +; VI-NEXT: v_add_u32_e32 v25, vcc, v25, v22 +; VI-NEXT: v_add_u32_e32 v25, vcc, 0x7fff, v25 +; VI-NEXT: v_or_b32_e32 v27, 0x400000, v22 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 -; VI-NEXT: v_cndmask_b32_e32 v22, v39, v48, vcc -; VI-NEXT: v_lshlrev_b32_e32 v39, 16, v23 -; VI-NEXT: v_add_f32_e32 v39, 0x40c00000, v39 -; VI-NEXT: v_bfe_u32 v48, v39, 16, 1 -; VI-NEXT: v_add_u32_e32 v48, vcc, v48, v39 -; VI-NEXT: v_add_u32_e32 v48, vcc, 0x7fff, v48 -; VI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; VI-NEXT: v_or_b32_e32 v49, 0x400000, v39 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v39, v39 -; VI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 -; VI-NEXT: v_cndmask_b32_e32 v39, v48, v49, vcc -; VI-NEXT: v_bfe_u32 v48, v23, 16, 1 -; VI-NEXT: v_add_u32_e32 v48, vcc, v48, v23 -; VI-NEXT: v_add_u32_e32 v48, vcc, 0x7fff, v48 -; VI-NEXT: v_or_b32_e32 v49, 0x400000, v23 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 -; VI-NEXT: v_cndmask_b32_e32 v23, v48, v49, vcc -; VI-NEXT: v_lshlrev_b32_e32 v48, 16, v24 -; VI-NEXT: v_add_f32_e32 v48, 0x40c00000, v48 -; VI-NEXT: v_bfe_u32 v49, v48, 16, 1 -; VI-NEXT: v_add_u32_e32 v49, vcc, v49, v48 -; VI-NEXT: v_add_u32_e32 v49, vcc, 0x7fff, v49 +; VI-NEXT: v_add_f32_e32 v22, 0x40c00000, v23 +; VI-NEXT: v_bfe_u32 v23, v22, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v25, v25, v27, vcc +; VI-NEXT: v_add_u32_e32 v23, vcc, v23, v22 +; VI-NEXT: v_add_u32_e32 v23, vcc, 0x7fff, v23 +; VI-NEXT: v_or_b32_e32 v27, 0x400000, v22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 +; VI-NEXT: v_cndmask_b32_e32 v22, v23, v27, vcc +; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v25 +; VI-NEXT: v_and_b32_e32 v25, 0xffff0000, v35 +; VI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 +; VI-NEXT: v_bfe_u32 v27, v25, 16, 1 +; VI-NEXT: v_add_u32_e32 v27, vcc, v27, v25 +; VI-NEXT: v_add_u32_e32 v27, vcc, 0x7fff, v27 +; VI-NEXT: v_or_b32_e32 v29, 0x400000, v25 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 +; VI-NEXT: v_cndmask_b32_e32 v25, v27, v29, vcc +; VI-NEXT: v_lshlrev_b32_e32 v27, 16, v35 +; VI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 +; VI-NEXT: v_bfe_u32 v29, v27, 16, 1 +; VI-NEXT: v_add_u32_e32 v29, vcc, v29, v27 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v25 +; VI-NEXT: v_lshlrev_b32_e32 v25, 16, v24 ; VI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 -; VI-NEXT: v_or_b32_e32 v50, 0x400000, v48 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v48, v48 +; VI-NEXT: v_add_u32_e32 v29, vcc, 0x7fff, v29 ; VI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 -; VI-NEXT: v_cndmask_b32_e32 v48, v49, v50, vcc -; VI-NEXT: v_bfe_u32 v49, v24, 16, 1 -; VI-NEXT: v_add_u32_e32 v49, vcc, v49, v24 -; VI-NEXT: v_add_u32_e32 v49, vcc, 0x7fff, v49 -; VI-NEXT: v_or_b32_e32 v50, 0x400000, 
v24 +; VI-NEXT: v_or_b32_e32 v35, 0x400000, v27 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 +; VI-NEXT: v_bfe_u32 v27, v24, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v50, v29, v35, vcc +; VI-NEXT: v_add_u32_e32 v27, vcc, v27, v24 +; VI-NEXT: v_add_u32_e32 v27, vcc, 0x7fff, v27 +; VI-NEXT: v_or_b32_e32 v29, 0x400000, v24 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 -; VI-NEXT: v_cndmask_b32_e32 v24, v49, v50, vcc -; VI-NEXT: v_lshlrev_b32_e32 v49, 16, v25 -; VI-NEXT: v_add_f32_e32 v49, 0x40c00000, v49 -; VI-NEXT: v_bfe_u32 v50, v49, 16, 1 -; VI-NEXT: v_add_u32_e32 v50, vcc, v50, v49 -; VI-NEXT: v_add_u32_e32 v50, vcc, 0x7fff, v50 -; VI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; VI-NEXT: v_or_b32_e32 v51, 0x400000, v49 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v49, v49 -; VI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 -; VI-NEXT: v_cndmask_b32_e32 v49, v50, v51, vcc -; VI-NEXT: v_bfe_u32 v50, v25, 16, 1 -; VI-NEXT: v_add_u32_e32 v50, vcc, v50, v25 -; VI-NEXT: v_add_u32_e32 v50, vcc, 0x7fff, v50 -; VI-NEXT: v_or_b32_e32 v51, 0x400000, v25 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 -; VI-NEXT: v_cndmask_b32_e32 v25, v50, v51, vcc -; VI-NEXT: v_lshlrev_b32_e32 v50, 16, v26 -; VI-NEXT: v_add_f32_e32 v50, 0x40c00000, v50 -; VI-NEXT: v_bfe_u32 v51, v50, 16, 1 -; VI-NEXT: v_add_u32_e32 v51, vcc, v51, v50 -; VI-NEXT: v_add_u32_e32 v51, vcc, 0x7fff, v51 +; VI-NEXT: v_add_f32_e32 v24, 0x40c00000, v25 +; VI-NEXT: v_bfe_u32 v25, v24, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v27, v27, v29, vcc +; VI-NEXT: v_add_u32_e32 v25, vcc, v25, v24 +; VI-NEXT: v_add_u32_e32 v25, vcc, 0x7fff, v25 +; VI-NEXT: v_or_b32_e32 v29, 0x400000, v24 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 +; VI-NEXT: v_cndmask_b32_e32 v24, v25, v29, vcc +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v27 +; VI-NEXT: v_and_b32_e32 v27, 0xffff0000, v34 +; VI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 +; VI-NEXT: v_bfe_u32 v29, v27, 16, 1 +; VI-NEXT: v_add_u32_e32 v29, vcc, v29, v27 +; VI-NEXT: v_add_u32_e32 v29, vcc, 0x7fff, v29 +; VI-NEXT: v_or_b32_e32 v35, 0x400000, v27 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 +; VI-NEXT: v_cndmask_b32_e32 v27, v29, v35, vcc +; VI-NEXT: v_lshlrev_b32_e32 v29, 16, v34 +; VI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; VI-NEXT: v_bfe_u32 v34, v29, 16, 1 +; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v29 +; VI-NEXT: v_add_u32_e32 v34, vcc, 0x7fff, v34 +; VI-NEXT: v_or_b32_e32 v35, 0x400000, v29 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 +; VI-NEXT: v_cndmask_b32_e32 v34, v34, v35, vcc +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v27 +; VI-NEXT: v_lshlrev_b32_e32 v27, 16, v26 ; VI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 -; VI-NEXT: v_or_b32_e32 v52, 0x400000, v50 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v50, v50 ; VI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 -; VI-NEXT: v_cndmask_b32_e32 v50, v51, v52, vcc -; VI-NEXT: v_bfe_u32 v51, v26, 16, 1 -; VI-NEXT: v_add_u32_e32 v51, vcc, v51, v26 -; VI-NEXT: v_add_u32_e32 v51, vcc, 0x7fff, v51 +; VI-NEXT: v_bfe_u32 v29, v26, 16, 1 +; VI-NEXT: v_add_u32_e32 v29, vcc, v29, v26 +; VI-NEXT: v_add_u32_e32 v29, vcc, 0x7fff, v29 ; VI-NEXT: v_or_b32_e32 v52, 0x400000, v26 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 -; VI-NEXT: v_cndmask_b32_e32 v26, v51, v52, vcc -; VI-NEXT: v_lshlrev_b32_e32 v51, 16, v27 -; VI-NEXT: v_add_f32_e32 v51, 0x40c00000, v51 -; VI-NEXT: v_bfe_u32 v52, v51, 16, 1 -; VI-NEXT: v_add_u32_e32 v52, vcc, v52, v51 +; VI-NEXT: v_add_f32_e32 v26, 0x40c00000, v27 +; VI-NEXT: v_bfe_u32 v27, v26, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v29, v29, v52, vcc +; VI-NEXT: v_add_u32_e32 v27, vcc, v27, v26 +; VI-NEXT: v_add_u32_e32 v27, 
vcc, 0x7fff, v27 +; VI-NEXT: v_or_b32_e32 v52, 0x400000, v26 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; VI-NEXT: v_cndmask_b32_e32 v26, v27, v52, vcc +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v29 +; VI-NEXT: v_and_b32_e32 v29, 0xffff0000, v33 +; VI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; VI-NEXT: v_bfe_u32 v52, v29, 16, 1 +; VI-NEXT: v_add_u32_e32 v52, vcc, v52, v29 ; VI-NEXT: v_add_u32_e32 v52, vcc, 0x7fff, v52 -; VI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 -; VI-NEXT: v_or_b32_e32 v53, 0x400000, v51 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v51, v51 -; VI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 -; VI-NEXT: v_cndmask_b32_e32 v51, v52, v53, vcc -; VI-NEXT: v_bfe_u32 v52, v27, 16, 1 -; VI-NEXT: v_add_u32_e32 v52, vcc, v52, v27 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; VI-NEXT: v_or_b32_e32 v53, 0x400000, v29 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 +; VI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; VI-NEXT: v_cndmask_b32_e32 v29, v52, v53, vcc +; VI-NEXT: v_bfe_u32 v52, v33, 16, 1 +; VI-NEXT: v_add_u32_e32 v52, vcc, v52, v33 ; VI-NEXT: v_add_u32_e32 v52, vcc, 0x7fff, v52 -; VI-NEXT: v_or_b32_e32 v53, 0x400000, v27 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 -; VI-NEXT: v_cndmask_b32_e32 v27, v52, v53, vcc -; VI-NEXT: v_lshlrev_b32_e32 v52, 16, v28 -; VI-NEXT: v_add_f32_e32 v52, 0x40c00000, v52 -; VI-NEXT: v_bfe_u32 v53, v52, 16, 1 -; VI-NEXT: v_add_u32_e32 v53, vcc, v53, v52 -; VI-NEXT: v_add_u32_e32 v53, vcc, 0x7fff, v53 +; VI-NEXT: v_or_b32_e32 v53, 0x400000, v33 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 +; VI-NEXT: v_cndmask_b32_e32 v52, v52, v53, vcc +; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v29 +; VI-NEXT: v_lshlrev_b32_e32 v29, 16, v28 ; VI-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 -; VI-NEXT: v_or_b32_e32 v54, 0x400000, v52 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v52, v52 ; VI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 -; VI-NEXT: v_cndmask_b32_e32 v52, v53, v54, vcc -; VI-NEXT: v_bfe_u32 v53, v28, 16, 1 -; VI-NEXT: v_add_u32_e32 v53, vcc, v53, v28 -; VI-NEXT: v_add_u32_e32 v53, vcc, 0x7fff, v53 +; VI-NEXT: v_bfe_u32 v33, v28, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v28 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v54, 0x400000, v28 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 -; VI-NEXT: v_cndmask_b32_e32 v28, v53, v54, vcc -; VI-NEXT: v_lshlrev_b32_e32 v53, 16, v29 -; VI-NEXT: v_add_f32_e32 v53, 0x40c00000, v53 -; VI-NEXT: v_bfe_u32 v54, v53, 16, 1 -; VI-NEXT: v_add_u32_e32 v54, vcc, v54, v53 +; VI-NEXT: v_add_f32_e32 v28, 0x40c00000, v29 +; VI-NEXT: v_bfe_u32 v29, v28, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v33, v33, v54, vcc +; VI-NEXT: v_add_u32_e32 v29, vcc, v29, v28 +; VI-NEXT: v_add_u32_e32 v29, vcc, 0x7fff, v29 +; VI-NEXT: v_or_b32_e32 v54, 0x400000, v28 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 +; VI-NEXT: v_cndmask_b32_e32 v28, v29, v54, vcc +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v33 +; VI-NEXT: v_and_b32_e32 v33, 0xffff0000, v32 +; VI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; VI-NEXT: v_bfe_u32 v54, v33, 16, 1 +; VI-NEXT: v_add_u32_e32 v54, vcc, v54, v33 ; VI-NEXT: v_add_u32_e32 v54, vcc, 0x7fff, v54 -; VI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 -; VI-NEXT: v_or_b32_e32 v55, 0x400000, v53 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v53, v53 -; VI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 -; VI-NEXT: v_cndmask_b32_e32 v53, v54, v55, vcc -; VI-NEXT: v_bfe_u32 v54, v29, 16, 1 -; VI-NEXT: v_add_u32_e32 v54, vcc, v54, v29 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; VI-NEXT: v_or_b32_e32 v55, 0x400000, v33 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 +; VI-NEXT: v_add_f32_e32 
v32, 0x40c00000, v32 +; VI-NEXT: v_cndmask_b32_e32 v33, v54, v55, vcc +; VI-NEXT: v_bfe_u32 v54, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v54, vcc, v54, v32 ; VI-NEXT: v_add_u32_e32 v54, vcc, 0x7fff, v54 -; VI-NEXT: v_or_b32_e32 v55, 0x400000, v29 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 -; VI-NEXT: v_cndmask_b32_e32 v29, v54, v55, vcc +; VI-NEXT: v_or_b32_e32 v55, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_cndmask_b32_e32 v32, v54, v55, vcc ; VI-NEXT: v_lshlrev_b32_e32 v54, 16, v30 -; VI-NEXT: v_add_f32_e32 v54, 0x40c00000, v54 -; VI-NEXT: v_bfe_u32 v55, v54, 16, 1 -; VI-NEXT: v_add_u32_e32 v55, vcc, v55, v54 -; VI-NEXT: v_add_u32_e32 v55, vcc, 0x7fff, v55 ; VI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 -; VI-NEXT: v_or_b32_e32 v40, 0x400000, v54 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v54, v54 ; VI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 -; VI-NEXT: v_cndmask_b32_e32 v54, v55, v40, vcc ; VI-NEXT: v_bfe_u32 v55, v30, 16, 1 ; VI-NEXT: v_add_u32_e32 v55, vcc, v55, v30 ; VI-NEXT: v_add_u32_e32 v55, vcc, 0x7fff, v55 ; VI-NEXT: v_or_b32_e32 v40, 0x400000, v30 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 +; VI-NEXT: v_add_f32_e32 v54, 0x40c00000, v54 ; VI-NEXT: v_cndmask_b32_e32 v30, v55, v40, vcc -; VI-NEXT: v_lshlrev_b32_e32 v55, 16, v31 -; VI-NEXT: v_add_f32_e32 v55, 0x40c00000, v55 -; VI-NEXT: v_bfe_u32 v40, v55, 16, 1 -; VI-NEXT: v_add_u32_e32 v40, vcc, v40, v55 -; VI-NEXT: v_add_u32_e32 v40, vcc, 0x7fff, v40 -; VI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 -; VI-NEXT: v_or_b32_e32 v41, 0x400000, v55 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v55, v55 -; VI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 -; VI-NEXT: v_cndmask_b32_e32 v55, v40, v41, vcc -; VI-NEXT: v_bfe_u32 v40, v31, 16, 1 -; VI-NEXT: v_add_u32_e32 v40, vcc, v40, v31 +; VI-NEXT: v_bfe_u32 v55, v54, 16, 1 +; VI-NEXT: v_add_u32_e32 v55, vcc, v55, v54 +; VI-NEXT: v_add_u32_e32 v55, vcc, 0x7fff, v55 +; VI-NEXT: v_or_b32_e32 v40, 0x400000, v54 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v54, v54 +; VI-NEXT: v_cndmask_b32_e32 v54, v55, v40, vcc +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v30 +; VI-NEXT: v_and_b32_e32 v30, 0xffff0000, v31 +; VI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 +; VI-NEXT: v_bfe_u32 v40, v30, 16, 1 +; VI-NEXT: v_add_u32_e32 v40, vcc, v40, v30 ; VI-NEXT: v_add_u32_e32 v40, vcc, 0x7fff, v40 -; VI-NEXT: v_or_b32_e32 v41, 0x400000, v31 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 -; VI-NEXT: v_cndmask_b32_e32 v31, v40, v41, vcc -; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 -; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 -; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 -; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 -; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 -; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 -; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 -; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 -; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 -; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 -; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; VI-NEXT: v_or_b32_e32 v41, 0x400000, v30 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 +; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v31 +; VI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 +; VI-NEXT: v_bfe_u32 v31, v30, 16, 1 +; VI-NEXT: v_lshrrev_b64 v[48:49], 16, v[48:49] +; VI-NEXT: v_cndmask_b32_e32 v40, v40, v41, vcc +; VI-NEXT: v_add_u32_e32 v31, vcc, v31, v30 +; VI-NEXT: v_lshrrev_b64 v[36:37], 16, v[36:37] +; VI-NEXT: v_add_u32_e32 v31, vcc, 0x7fff, v31 +; VI-NEXT: v_mov_b32_e32 v37, v48 +; VI-NEXT: v_lshrrev_b64 v[48:49], 16, v[50:51] +; VI-NEXT: v_or_b32_e32 v41, 0x400000, v30 +; VI-NEXT: v_cmp_u_f32_e32 
vcc, v30, v30 +; VI-NEXT: v_lshrrev_b64 v[34:35], 16, v[34:35] +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; VI-NEXT: v_cndmask_b32_e32 v30, v31, v41, vcc +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v40 +; VI-NEXT: v_mov_b32_e32 v35, v48 +; VI-NEXT: v_lshrrev_b64 v[48:49], 16, v[52:53] ; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; VI-NEXT: v_alignbit_b32 v31, v31, v55, 16 -; VI-NEXT: v_alignbit_b32 v30, v30, v54, 16 -; VI-NEXT: v_alignbit_b32 v29, v29, v53, 16 -; VI-NEXT: v_alignbit_b32 v28, v28, v52, 16 -; VI-NEXT: v_alignbit_b32 v27, v27, v51, 16 -; VI-NEXT: v_alignbit_b32 v26, v26, v50, 16 -; VI-NEXT: v_alignbit_b32 v25, v25, v49, 16 -; VI-NEXT: v_alignbit_b32 v24, v24, v48, 16 -; VI-NEXT: v_alignbit_b32 v23, v23, v39, 16 -; VI-NEXT: v_alignbit_b32 v22, v22, v38, 16 -; VI-NEXT: v_alignbit_b32 v21, v21, v37, 16 -; VI-NEXT: v_alignbit_b32 v20, v20, v36, 16 -; VI-NEXT: v_alignbit_b32 v19, v19, v35, 16 -; VI-NEXT: v_alignbit_b32 v32, v32, v34, 16 -; VI-NEXT: v_alignbit_b32 v17, v17, v33, 16 -; VI-NEXT: v_alignbit_b32 v16, v16, v18, 16 +; VI-NEXT: v_lshrrev_b64 v[32:33], 16, v[32:33] +; VI-NEXT: v_lshrrev_b64 v[50:51], 16, v[30:31] +; VI-NEXT: v_lshrrev_b64 v[38:39], 16, v[38:39] +; VI-NEXT: v_mov_b32_e32 v33, v48 +; VI-NEXT: v_lshrrev_b64 v[30:31], 16, v[54:55] +; VI-NEXT: v_lshrrev_b64 v[28:29], 16, v[28:29] +; VI-NEXT: v_lshrrev_b64 v[26:27], 16, v[26:27] +; VI-NEXT: v_lshrrev_b64 v[24:25], 16, v[24:25] +; VI-NEXT: v_lshrrev_b64 v[22:23], 16, v[22:23] +; VI-NEXT: v_lshrrev_b64 v[20:21], 16, v[20:21] +; VI-NEXT: v_lshrrev_b64 v[48:49], 16, v[18:19] +; VI-NEXT: v_lshrrev_b64 v[16:17], 16, v[16:17] +; VI-NEXT: v_mov_b32_e32 v31, v50 ; VI-NEXT: s_branch .LBB105_5 ; VI-NEXT: .LBB105_3: ; VI-NEXT: s_branch .LBB105_2 @@ -235568,7 +236891,14 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a ; VI-NEXT: .LBB105_5: ; %end ; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; VI-NEXT: v_mov_b32_e32 v18, v32 +; VI-NEXT: v_mov_b32_e32 v17, v38 +; VI-NEXT: v_mov_b32_e32 v18, v48 +; VI-NEXT: v_mov_b32_e32 v19, v37 +; VI-NEXT: v_mov_b32_e32 v21, v36 +; VI-NEXT: v_mov_b32_e32 v23, v35 +; VI-NEXT: v_mov_b32_e32 v25, v34 +; VI-NEXT: v_mov_b32_e32 v27, v33 +; VI-NEXT: v_mov_b32_e32 v29, v32 ; VI-NEXT: v_readlane_b32 s31, v42, 1 ; VI-NEXT: v_readlane_b32 s30, v42, 0 ; VI-NEXT: s_or_saveexec_b64 s[4:5], -1 @@ -238611,54 +239941,54 @@ define inreg <64 x bfloat> @bitcast_v64i16_to_v64bf16_scalar(<64 x i16> inreg %a ; SI-NEXT: v_writelane_b32 v40, s85, 29 ; SI-NEXT: v_writelane_b32 v40, s86, 30 ; SI-NEXT: v_writelane_b32 v40, s87, 31 +; SI-NEXT: s_mov_b32 s74, s23 +; SI-NEXT: s_mov_b32 s72, s21 +; SI-NEXT: s_mov_b32 s61, s18 ; SI-NEXT: ; implicit-def: $vgpr41 : SGPR spill to VGPR lane ; SI-NEXT: s_mov_b32 s60, s16 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_writelane_b32 v41, s17, 0 -; SI-NEXT: s_mov_b32 s61, s19 ; SI-NEXT: v_writelane_b32 v41, s60, 1 -; SI-NEXT: s_mov_b32 s63, s18 -; SI-NEXT: v_writelane_b32 v41, s61, 2 -; SI-NEXT: s_mov_b32 s72, s21 -; SI-NEXT: v_writelane_b32 v41, s63, 3 +; SI-NEXT: v_writelane_b32 v41, s19, 2 +; SI-NEXT: v_writelane_b32 v41, s61, 3 ; SI-NEXT: v_writelane_b32 v41, s72, 4 -; SI-NEXT: s_mov_b32 s74, s23 ; SI-NEXT: v_writelane_b32 v41, s20, 5 ; SI-NEXT: v_writelane_b32 v41, s74, 6 -; SI-NEXT: s_mov_b32 s75, s25 +; SI-NEXT: 
s_mov_b32 s76, s25 ; SI-NEXT: v_writelane_b32 v41, s22, 7 -; SI-NEXT: v_writelane_b32 v41, s75, 8 -; SI-NEXT: s_mov_b32 s76, s27 +; SI-NEXT: v_writelane_b32 v41, s76, 8 +; SI-NEXT: s_mov_b32 s78, s27 ; SI-NEXT: v_writelane_b32 v41, s24, 9 -; SI-NEXT: v_writelane_b32 v41, s76, 10 -; SI-NEXT: s_mov_b32 s93, s29 +; SI-NEXT: v_writelane_b32 v41, s78, 10 +; SI-NEXT: s_mov_b32 s88, s29 ; SI-NEXT: v_writelane_b32 v41, s26, 11 -; SI-NEXT: v_writelane_b32 v41, s93, 12 -; SI-NEXT: v_readfirstlane_b32 s16, v2 +; SI-NEXT: v_writelane_b32 v41, s88, 12 +; SI-NEXT: v_readfirstlane_b32 s77, v2 ; SI-NEXT: v_writelane_b32 v41, s28, 13 -; SI-NEXT: v_readfirstlane_b32 s73, v4 -; SI-NEXT: v_writelane_b32 v41, s16, 14 -; SI-NEXT: v_readfirstlane_b32 s89, v3 -; SI-NEXT: v_writelane_b32 v41, s73, 15 -; SI-NEXT: v_readfirstlane_b32 s90, v6 -; SI-NEXT: v_writelane_b32 v41, s89, 16 -; SI-NEXT: v_readfirstlane_b32 s91, v5 -; SI-NEXT: v_writelane_b32 v41, s90, 17 -; SI-NEXT: v_readfirstlane_b32 s34, v8 -; SI-NEXT: v_writelane_b32 v41, s91, 18 -; SI-NEXT: v_readfirstlane_b32 s35, v7 -; SI-NEXT: v_writelane_b32 v41, s34, 19 -; SI-NEXT: v_readfirstlane_b32 s36, v10 -; SI-NEXT: v_writelane_b32 v41, s35, 20 -; SI-NEXT: v_writelane_b32 v40, s96, 32 -; SI-NEXT: v_readfirstlane_b32 s37, v9 -; SI-NEXT: v_writelane_b32 v41, s36, 21 +; SI-NEXT: v_readfirstlane_b32 s79, v4 +; SI-NEXT: v_writelane_b32 v41, s77, 14 +; SI-NEXT: v_readfirstlane_b32 s90, v3 +; SI-NEXT: v_writelane_b32 v41, s79, 15 +; SI-NEXT: v_readfirstlane_b32 s91, v6 +; SI-NEXT: v_writelane_b32 v41, s90, 16 +; SI-NEXT: v_readfirstlane_b32 s92, v5 +; SI-NEXT: v_writelane_b32 v41, s91, 17 +; SI-NEXT: v_readfirstlane_b32 s93, v8 +; SI-NEXT: v_writelane_b32 v41, s92, 18 +; SI-NEXT: v_readfirstlane_b32 s94, v7 +; SI-NEXT: v_writelane_b32 v41, s93, 19 +; SI-NEXT: v_readfirstlane_b32 s95, v10 +; SI-NEXT: v_writelane_b32 v41, s94, 20 +; SI-NEXT: v_readfirstlane_b32 s30, v9 +; SI-NEXT: v_writelane_b32 v41, s95, 21 +; SI-NEXT: v_readfirstlane_b32 s31, v12 +; SI-NEXT: v_writelane_b32 v41, s30, 22 ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_readfirstlane_b32 s62, v31 +; SI-NEXT: v_readfirstlane_b32 s21, v31 ; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_readfirstlane_b32 s80, v32 ; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_readfirstlane_b32 s69, v33 +; SI-NEXT: v_readfirstlane_b32 s75, v33 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:44 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:40 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:36 @@ -238670,20 +240000,25 @@ define inreg <64 x bfloat> @bitcast_v64i16_to_v64bf16_scalar(<64 x i16> inreg %a ; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_readfirstlane_b32 s84, v34 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_readfirstlane_b32 s68, v35 +; SI-NEXT: v_readfirstlane_b32 s23, v35 ; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_readfirstlane_b32 s83, v36 ; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_readfirstlane_b32 s87, v38 ; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:80 -; SI-NEXT: v_readfirstlane_b32 s6, v37 +; SI-NEXT: v_readfirstlane_b32 s18, v37 ; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:12 ; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:8 ; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 +; SI-NEXT: v_writelane_b32 v41, s31, 23 +; SI-NEXT: v_readfirstlane_b32 s34, v11 +; SI-NEXT: v_readfirstlane_b32 s35, v14 +; SI-NEXT: v_readfirstlane_b32 s36, v13 +; SI-NEXT: v_writelane_b32 v40, s96, 32 +; SI-NEXT: 
v_readfirstlane_b32 s37, v16 ; SI-NEXT: v_writelane_b32 v40, s97, 33 -; SI-NEXT: v_readfirstlane_b32 s38, v12 -; SI-NEXT: v_writelane_b32 v41, s37, 22 +; SI-NEXT: v_readfirstlane_b32 s38, v15 ; SI-NEXT: v_writelane_b32 v40, s98, 34 ; SI-NEXT: v_readfirstlane_b32 s14, v30 ; SI-NEXT: v_readfirstlane_b32 s15, v29 @@ -238693,21 +240028,13 @@ define inreg <64 x bfloat> @bitcast_v64i16_to_v64bf16_scalar(<64 x i16> inreg %a ; SI-NEXT: v_readfirstlane_b32 s11, v25 ; SI-NEXT: v_readfirstlane_b32 s8, v24 ; SI-NEXT: v_readfirstlane_b32 s9, v23 -; SI-NEXT: v_readfirstlane_b32 s88, v22 -; SI-NEXT: v_readfirstlane_b32 s29, v21 -; SI-NEXT: v_readfirstlane_b32 s79, v20 -; SI-NEXT: v_readfirstlane_b32 s27, v19 -; SI-NEXT: v_readfirstlane_b32 s78, v18 -; SI-NEXT: v_readfirstlane_b32 s25, v17 -; SI-NEXT: v_readfirstlane_b32 s77, v16 -; SI-NEXT: v_readfirstlane_b32 s23, v15 -; SI-NEXT: v_readfirstlane_b32 s39, v14 -; SI-NEXT: v_readfirstlane_b32 s21, v13 -; SI-NEXT: v_readfirstlane_b32 s19, v11 -; SI-NEXT: v_readfirstlane_b32 s18, v1 -; SI-NEXT: v_writelane_b32 v41, s38, 23 +; SI-NEXT: v_readfirstlane_b32 s89, v22 +; SI-NEXT: v_readfirstlane_b32 s7, v21 +; SI-NEXT: v_readfirstlane_b32 s25, v20 +; SI-NEXT: v_readfirstlane_b32 s29, v19 +; SI-NEXT: v_readfirstlane_b32 s39, v18 +; SI-NEXT: v_readfirstlane_b32 s27, v17 ; SI-NEXT: v_writelane_b32 v40, s99, 35 -; SI-NEXT: v_writelane_b32 v41, s39, 24 ; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_readfirstlane_b32 s58, v31 ; SI-NEXT: s_waitcnt vmcnt(11) @@ -238727,261 +240054,289 @@ define inreg <64 x bfloat> @bitcast_v64i16_to_v64bf16_scalar(<64 x i16> inreg %a ; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_readfirstlane_b32 s42, v34 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v38 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s5, v1 +; SI-NEXT: v_writelane_b32 v41, s5, 24 +; SI-NEXT: v_writelane_b32 v41, s34, 25 +; SI-NEXT: v_writelane_b32 v41, s35, 26 +; SI-NEXT: v_writelane_b32 v41, s36, 27 +; SI-NEXT: v_writelane_b32 v41, s37, 28 ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_readfirstlane_b32 s43, v35 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_readfirstlane_b32 s40, v36 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_readfirstlane_b32 s41, v37 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_writelane_b32 v41, s38, 29 +; SI-NEXT: v_writelane_b32 v41, s39, 30 ; SI-NEXT: s_cbranch_scc0 .LBB107_2 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_lshl_b32 s4, s60, 16 -; SI-NEXT: v_writelane_b32 v41, s4, 25 -; SI-NEXT: s_lshl_b32 s4, s63, 16 -; SI-NEXT: v_writelane_b32 v41, s4, 26 -; SI-NEXT: s_lshl_b32 s4, s20, 16 -; SI-NEXT: v_writelane_b32 v41, s4, 27 -; SI-NEXT: s_lshl_b32 s4, s22, 16 -; SI-NEXT: v_writelane_b32 v41, s4, 28 -; SI-NEXT: s_lshl_b32 s4, s24, 16 -; SI-NEXT: v_writelane_b32 v41, s4, 29 -; SI-NEXT: s_lshl_b32 s4, s26, 16 -; SI-NEXT: v_writelane_b32 v41, s4, 30 -; SI-NEXT: s_lshl_b32 s4, s28, 16 -; SI-NEXT: v_writelane_b32 v41, s4, 31 -; SI-NEXT: s_lshl_b32 s4, s18, 16 ; SI-NEXT: v_writelane_b32 v41, s4, 32 -; SI-NEXT: s_lshl_b32 s4, s89, 16 -; SI-NEXT: v_writelane_b32 v41, s4, 33 -; SI-NEXT: s_lshl_b32 s4, s91, 16 +; SI-NEXT: s_lshl_b32 s4, s17, 16 +; SI-NEXT: v_writelane_b32 v41, s4, 31 +; SI-NEXT: s_lshl_b32 s4, s61, 16 ; SI-NEXT: v_writelane_b32 v41, s4, 34 -; SI-NEXT: s_lshl_b32 s4, s35, 16 -; SI-NEXT: v_writelane_b32 v41, s4, 35 -; SI-NEXT: s_lshl_b32 s4, s37, 16 -; SI-NEXT: s_lshl_b32 s7, s17, 16 -; SI-NEXT: s_lshl_b32 s96, s61, 16 -; SI-NEXT: s_lshl_b32 s99, s72, 16 -; SI-NEXT: s_lshl_b32 s97, s74, 16 -; SI-NEXT: s_lshl_b32 s92, s75, 16 -; 
SI-NEXT: s_lshl_b32 s94, s76, 16 -; SI-NEXT: s_lshl_b32 s95, s93, 16 -; SI-NEXT: s_lshl_b32 s93, s16, 16 -; SI-NEXT: s_lshl_b32 s30, s73, 16 -; SI-NEXT: s_lshl_b32 s31, s90, 16 -; SI-NEXT: s_lshl_b32 s34, s34, 16 +; SI-NEXT: s_lshl_b32 s4, s19, 16 +; SI-NEXT: v_writelane_b32 v41, s4, 33 +; SI-NEXT: s_lshl_b32 s4, s20, 16 ; SI-NEXT: v_writelane_b32 v41, s4, 36 -; SI-NEXT: s_lshl_b32 s35, s36, 16 -; SI-NEXT: s_lshl_b32 s86, s19, 16 -; SI-NEXT: s_lshl_b32 s36, s38, 16 -; SI-NEXT: s_lshl_b32 s22, s21, 16 -; SI-NEXT: s_lshl_b32 s37, s39, 16 -; SI-NEXT: s_lshl_b32 s24, s23, 16 -; SI-NEXT: s_lshl_b32 s38, s77, 16 -; SI-NEXT: s_lshl_b32 s28, s25, 16 -; SI-NEXT: s_lshl_b32 s39, s78, 16 -; SI-NEXT: s_lshl_b32 s61, s27, 16 -; SI-NEXT: s_lshl_b32 s48, s79, 16 -; SI-NEXT: s_lshl_b32 s89, s29, 16 -; SI-NEXT: s_lshl_b32 s49, s88, 16 -; SI-NEXT: s_lshl_b32 s60, s9, 16 -; SI-NEXT: s_lshl_b32 s50, s8, 16 -; SI-NEXT: s_lshl_b32 s90, s11, 16 -; SI-NEXT: s_lshl_b32 s91, s10, 16 -; SI-NEXT: s_lshl_b32 s70, s13, 16 -; SI-NEXT: s_lshl_b32 s51, s12, 16 -; SI-NEXT: s_lshl_b32 s71, s15, 16 -; SI-NEXT: s_lshl_b32 s52, s14, 16 -; SI-NEXT: s_lshl_b32 s20, s41, 16 -; SI-NEXT: s_lshl_b32 s53, s40, 16 -; SI-NEXT: s_lshl_b32 s81, s43, 16 -; SI-NEXT: s_lshl_b32 s54, s42, 16 -; SI-NEXT: s_lshl_b32 s63, s45, 16 -; SI-NEXT: s_lshl_b32 s55, s44, 16 -; SI-NEXT: s_lshl_b32 s72, s47, 16 -; SI-NEXT: s_lshl_b32 s64, s46, 16 -; SI-NEXT: s_lshl_b32 s82, s57, 16 -; SI-NEXT: s_lshl_b32 s65, s56, 16 -; SI-NEXT: s_lshl_b32 s74, s59, 16 -; SI-NEXT: s_lshl_b32 s66, s58, 16 -; SI-NEXT: s_lshl_b32 s75, s87, 16 -; SI-NEXT: s_mov_b32 s73, s6 -; SI-NEXT: s_lshl_b32 s67, s6, 16 -; SI-NEXT: s_lshl_b32 s76, s83, 16 -; SI-NEXT: s_mov_b32 s16, s68 -; SI-NEXT: s_lshl_b32 s68, s68, 16 -; SI-NEXT: s_lshl_b32 s85, s84, 16 -; SI-NEXT: s_mov_b32 s98, s69 -; SI-NEXT: s_lshl_b32 s69, s69, 16 -; SI-NEXT: s_lshl_b32 s17, s80, 16 -; SI-NEXT: s_mov_b32 s6, s62 -; SI-NEXT: s_lshl_b32 s26, s62, 16 +; SI-NEXT: s_lshl_b32 s4, s72, 16 +; SI-NEXT: v_writelane_b32 v41, s4, 35 +; SI-NEXT: s_lshl_b32 s4, s74, 16 +; SI-NEXT: s_lshl_b32 s16, s22, 16 +; SI-NEXT: v_writelane_b32 v41, s4, 37 +; SI-NEXT: s_lshl_b32 s6, s24, 16 +; SI-NEXT: s_lshl_b32 s73, s76, 16 +; SI-NEXT: s_lshl_b32 s98, s26, 16 +; SI-NEXT: s_lshl_b32 s63, s78, 16 +; SI-NEXT: s_lshl_b32 s96, s28, 16 +; SI-NEXT: s_lshl_b32 s62, s88, 16 +; SI-NEXT: s_lshl_b32 s97, s5, 16 +; SI-NEXT: s_lshl_b32 s99, s77, 16 +; SI-NEXT: s_lshl_b32 s85, s90, 16 +; SI-NEXT: s_lshl_b32 s86, s79, 16 +; SI-NEXT: s_lshl_b32 s81, s92, 16 +; SI-NEXT: s_lshl_b32 s82, s91, 16 +; SI-NEXT: s_lshl_b32 s70, s94, 16 +; SI-NEXT: s_lshl_b32 s71, s93, 16 +; SI-NEXT: s_lshl_b32 s68, s30, 16 +; SI-NEXT: s_lshl_b32 s69, s95, 16 +; SI-NEXT: s_lshl_b32 s66, s34, 16 +; SI-NEXT: s_lshl_b32 s67, s31, 16 +; SI-NEXT: s_lshl_b32 s64, s36, 16 +; SI-NEXT: s_lshl_b32 s65, s35, 16 +; SI-NEXT: s_lshl_b32 s54, s38, 16 +; SI-NEXT: s_lshl_b32 s55, s37, 16 +; SI-NEXT: s_lshl_b32 s52, s27, 16 +; SI-NEXT: s_lshl_b32 s53, s39, 16 +; SI-NEXT: s_lshl_b32 s50, s29, 16 +; SI-NEXT: s_lshl_b32 s51, s25, 16 +; SI-NEXT: s_lshl_b32 s48, s7, 16 +; SI-NEXT: s_lshl_b32 s49, s89, 16 +; SI-NEXT: s_lshl_b32 s38, s9, 16 +; SI-NEXT: s_lshl_b32 s39, s8, 16 +; SI-NEXT: s_lshl_b32 s37, s11, 16 +; SI-NEXT: s_lshl_b32 s35, s10, 16 +; SI-NEXT: s_lshl_b32 s31, s13, 16 +; SI-NEXT: s_lshl_b32 s36, s12, 16 +; SI-NEXT: s_lshl_b32 s95, s15, 16 +; SI-NEXT: s_lshl_b32 s34, s14, 16 +; SI-NEXT: s_lshl_b32 s93, s41, 16 +; SI-NEXT: s_lshl_b32 s30, s40, 16 +; SI-NEXT: s_lshl_b32 s91, s43, 16 +; SI-NEXT: 
s_lshl_b32 s94, s42, 16 +; SI-NEXT: s_lshl_b32 s92, s45, 16 +; SI-NEXT: s_lshl_b32 s90, s44, 16 +; SI-NEXT: s_lshl_b32 s88, s47, 16 +; SI-NEXT: s_lshl_b32 s28, s46, 16 +; SI-NEXT: s_lshl_b32 s78, s57, 16 +; SI-NEXT: s_lshl_b32 s26, s56, 16 +; SI-NEXT: s_lshl_b32 s76, s59, 16 +; SI-NEXT: s_lshl_b32 s24, s58, 16 +; SI-NEXT: s_lshl_b32 s74, s87, 16 +; SI-NEXT: s_mov_b32 s77, s18 +; SI-NEXT: s_lshl_b32 s22, s18, 16 +; SI-NEXT: s_lshl_b32 s72, s83, 16 +; SI-NEXT: s_mov_b32 s79, s23 +; SI-NEXT: s_lshl_b32 s20, s23, 16 +; SI-NEXT: s_lshl_b32 s61, s84, 16 +; SI-NEXT: s_mov_b32 s18, s75 +; SI-NEXT: s_lshl_b32 s19, s75, 16 +; SI-NEXT: s_lshl_b32 s60, s80, 16 +; SI-NEXT: s_lshl_b32 s17, s21, 16 ; SI-NEXT: s_mov_b64 s[4:5], 0 ; SI-NEXT: s_branch .LBB107_3 ; SI-NEXT: .LBB107_2: -; SI-NEXT: ; implicit-def: $sgpr17 -; SI-NEXT: ; kill: killed $sgpr17 -; SI-NEXT: s_mov_b32 s16, s68 -; SI-NEXT: ; implicit-def: $sgpr17 -; SI-NEXT: ; kill: killed $sgpr17 -; SI-NEXT: s_mov_b32 s73, s6 -; SI-NEXT: ; implicit-def: $sgpr17 -; SI-NEXT: ; kill: killed $sgpr17 -; SI-NEXT: s_mov_b32 s6, s62 -; SI-NEXT: ; implicit-def: $sgpr17 -; SI-NEXT: ; kill: killed $sgpr17 -; SI-NEXT: s_mov_b32 s98, s69 -; SI-NEXT: ; implicit-def: $sgpr17 -; SI-NEXT: ; kill: killed $sgpr17 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; kill: killed $sgpr6 +; SI-NEXT: s_mov_b32 s79, s23 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; kill: killed $sgpr6 +; SI-NEXT: s_mov_b32 s77, s18 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; kill: killed $sgpr6 +; SI-NEXT: s_mov_b32 s18, s75 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; kill: killed $sgpr6 ; SI-NEXT: s_mov_b64 s[4:5], -1 -; SI-NEXT: ; implicit-def: $sgpr17 -; SI-NEXT: ; kill: killed $sgpr17 -; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; kill: killed $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr16 +; SI-NEXT: ; implicit-def: $sgpr73 +; SI-NEXT: ; implicit-def: $sgpr98 +; SI-NEXT: ; implicit-def: $sgpr63 ; SI-NEXT: ; implicit-def: $sgpr96 -; SI-NEXT: ; implicit-def: $sgpr99 +; SI-NEXT: ; implicit-def: $sgpr62 ; SI-NEXT: ; implicit-def: $sgpr97 -; SI-NEXT: ; implicit-def: $sgpr92 -; SI-NEXT: ; implicit-def: $sgpr94 -; SI-NEXT: ; implicit-def: $sgpr95 -; SI-NEXT: ; implicit-def: $sgpr93 -; SI-NEXT: ; implicit-def: $sgpr30 -; SI-NEXT: ; implicit-def: $sgpr31 -; SI-NEXT: ; implicit-def: $sgpr34 -; SI-NEXT: ; implicit-def: $sgpr35 +; SI-NEXT: ; implicit-def: $sgpr99 +; SI-NEXT: ; implicit-def: $sgpr85 ; SI-NEXT: ; implicit-def: $sgpr86 -; SI-NEXT: ; implicit-def: $sgpr36 -; SI-NEXT: ; implicit-def: $sgpr22 -; SI-NEXT: ; implicit-def: $sgpr37 -; SI-NEXT: ; implicit-def: $sgpr24 -; SI-NEXT: ; implicit-def: $sgpr38 -; SI-NEXT: ; implicit-def: $sgpr28 -; SI-NEXT: ; implicit-def: $sgpr39 -; SI-NEXT: ; implicit-def: $sgpr61 -; SI-NEXT: ; implicit-def: $sgpr48 -; SI-NEXT: ; implicit-def: $sgpr89 -; SI-NEXT: ; implicit-def: $sgpr49 -; SI-NEXT: ; implicit-def: $sgpr60 -; SI-NEXT: ; implicit-def: $sgpr50 -; SI-NEXT: ; implicit-def: $sgpr90 -; SI-NEXT: ; implicit-def: $sgpr91 -; SI-NEXT: ; implicit-def: $sgpr70 -; SI-NEXT: ; implicit-def: $sgpr51 -; SI-NEXT: ; implicit-def: $sgpr71 -; SI-NEXT: ; implicit-def: $sgpr52 -; SI-NEXT: ; implicit-def: $sgpr20 -; SI-NEXT: ; implicit-def: $sgpr53 ; SI-NEXT: ; implicit-def: $sgpr81 -; SI-NEXT: ; implicit-def: $sgpr54 -; SI-NEXT: ; implicit-def: $sgpr63 -; SI-NEXT: ; implicit-def: $sgpr55 -; SI-NEXT: ; implicit-def: $sgpr72 -; SI-NEXT: ; implicit-def: $sgpr64 ; SI-NEXT: ; implicit-def: $sgpr82 -; SI-NEXT: ; implicit-def: $sgpr65 -; 
SI-NEXT: ; implicit-def: $sgpr74 -; SI-NEXT: ; implicit-def: $sgpr66 -; SI-NEXT: ; implicit-def: $sgpr75 -; SI-NEXT: ; implicit-def: $sgpr67 -; SI-NEXT: ; implicit-def: $sgpr76 +; SI-NEXT: ; implicit-def: $sgpr70 +; SI-NEXT: ; implicit-def: $sgpr71 ; SI-NEXT: ; implicit-def: $sgpr68 -; SI-NEXT: ; implicit-def: $sgpr85 ; SI-NEXT: ; implicit-def: $sgpr69 +; SI-NEXT: ; implicit-def: $sgpr66 +; SI-NEXT: ; implicit-def: $sgpr67 +; SI-NEXT: ; implicit-def: $sgpr64 +; SI-NEXT: ; implicit-def: $sgpr65 +; SI-NEXT: ; implicit-def: $sgpr54 +; SI-NEXT: ; implicit-def: $sgpr55 +; SI-NEXT: ; implicit-def: $sgpr52 +; SI-NEXT: ; implicit-def: $sgpr53 +; SI-NEXT: ; implicit-def: $sgpr50 +; SI-NEXT: ; implicit-def: $sgpr51 +; SI-NEXT: ; implicit-def: $sgpr48 +; SI-NEXT: ; implicit-def: $sgpr49 +; SI-NEXT: ; implicit-def: $sgpr38 +; SI-NEXT: ; implicit-def: $sgpr39 +; SI-NEXT: ; implicit-def: $sgpr37 +; SI-NEXT: ; implicit-def: $sgpr35 +; SI-NEXT: ; implicit-def: $sgpr31 +; SI-NEXT: ; implicit-def: $sgpr36 +; SI-NEXT: ; implicit-def: $sgpr95 +; SI-NEXT: ; implicit-def: $sgpr34 +; SI-NEXT: ; implicit-def: $sgpr93 +; SI-NEXT: ; implicit-def: $sgpr30 +; SI-NEXT: ; implicit-def: $sgpr91 +; SI-NEXT: ; implicit-def: $sgpr94 +; SI-NEXT: ; implicit-def: $sgpr92 +; SI-NEXT: ; implicit-def: $sgpr90 +; SI-NEXT: ; implicit-def: $sgpr88 +; SI-NEXT: ; implicit-def: $sgpr28 +; SI-NEXT: ; implicit-def: $sgpr78 ; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr76 +; SI-NEXT: ; implicit-def: $sgpr24 +; SI-NEXT: ; implicit-def: $sgpr74 +; SI-NEXT: ; implicit-def: $sgpr22 +; SI-NEXT: ; implicit-def: $sgpr72 +; SI-NEXT: ; implicit-def: $sgpr20 +; SI-NEXT: ; implicit-def: $sgpr61 +; SI-NEXT: ; implicit-def: $sgpr19 +; SI-NEXT: ; implicit-def: $sgpr60 ; SI-NEXT: ; implicit-def: $sgpr17 -; SI-NEXT: ; kill: killed $sgpr17 -; SI-NEXT: ; implicit-def: $sgpr17 -; SI-NEXT: ; kill: killed $sgpr17 -; SI-NEXT: ; implicit-def: $sgpr17 -; SI-NEXT: ; kill: killed $sgpr17 -; SI-NEXT: ; implicit-def: $sgpr17 -; SI-NEXT: ; kill: killed $sgpr17 -; SI-NEXT: ; implicit-def: $sgpr17 -; SI-NEXT: ; kill: killed $sgpr17 -; SI-NEXT: ; implicit-def: $sgpr17 -; SI-NEXT: ; kill: killed $sgpr17 -; SI-NEXT: ; implicit-def: $sgpr17 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; kill: killed $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; kill: killed $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr6 ; SI-NEXT: .LBB107_3: ; %Flow ; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; SI-NEXT: s_mov_b32 s4, s60 ; SI-NEXT: s_mov_b32 s5, s17 -; SI-NEXT: s_mov_b32 s17, s86 -; SI-NEXT: s_mov_b32 s86, s7 +; SI-NEXT: s_mov_b32 s17, s61 +; SI-NEXT: s_mov_b32 s60, s72 +; SI-NEXT: s_mov_b32 s61, s74 +; SI-NEXT: s_mov_b32 s72, s76 +; SI-NEXT: s_mov_b32 s74, s78 +; SI-NEXT: s_mov_b32 s76, s88 +; SI-NEXT: s_mov_b32 s78, s92 +; SI-NEXT: s_mov_b32 s88, s91 +; SI-NEXT: s_mov_b32 s91, s93 +; SI-NEXT: s_mov_b32 s92, s94 +; SI-NEXT: s_mov_b32 s93, s95 +; SI-NEXT: s_mov_b32 s94, s30 +; SI-NEXT: s_mov_b32 s95, s31 +; SI-NEXT: s_mov_b32 s30, s34 +; SI-NEXT: s_mov_b32 s31, s37 +; SI-NEXT: s_mov_b32 s34, s36 +; SI-NEXT: s_mov_b32 s36, s38 +; SI-NEXT: s_mov_b32 s37, s39 +; SI-NEXT: s_mov_b32 s38, s48 +; SI-NEXT: s_mov_b32 s39, s49 +; SI-NEXT: s_mov_b32 s48, s50 +; SI-NEXT: s_mov_b32 s49, s51 +; SI-NEXT: s_mov_b32 s50, s52 +; SI-NEXT: s_mov_b32 s51, s53 +; SI-NEXT: s_mov_b32 s52, s54 +; SI-NEXT: s_mov_b32 s53, s55 +; SI-NEXT: s_mov_b32 s54, s6 +; SI-NEXT: s_mov_b32 s55, s16 ; SI-NEXT: s_cbranch_vccnz .LBB107_5 ; SI-NEXT: ; %bb.4: ; %cmp.true -; SI-NEXT: s_lshl_b32 s5, s6, 16 
-; SI-NEXT: v_readlane_b32 s6, v41, 24 -; SI-NEXT: s_lshl_b32 s20, s6, 16 -; SI-NEXT: v_readlane_b32 s6, v41, 23 -; SI-NEXT: s_lshl_b32 s17, s6, 16 -; SI-NEXT: v_readlane_b32 s6, v41, 22 -; SI-NEXT: s_lshl_b32 s61, s16, 16 -; SI-NEXT: s_add_i32 s16, s6, 3 -; SI-NEXT: v_readlane_b32 s6, v41, 21 -; SI-NEXT: s_and_b32 s16, s16, 0xffff -; SI-NEXT: s_lshl_b32 s7, s6, 16 -; SI-NEXT: v_readlane_b32 s6, v41, 20 -; SI-NEXT: s_or_b32 s7, s7, s16 -; SI-NEXT: s_add_i32 s6, s6, 3 -; SI-NEXT: v_readlane_b32 s16, v41, 19 -; SI-NEXT: s_add_i32 s19, s19, 3 -; SI-NEXT: s_and_b32 s6, s6, 0xffff -; SI-NEXT: s_lshl_b32 s16, s16, 16 -; SI-NEXT: s_and_b32 s19, s19, 0xffff -; SI-NEXT: s_or_b32 s6, s16, s6 -; SI-NEXT: v_readlane_b32 s16, v41, 18 -; SI-NEXT: s_lshl_b32 s60, s98, 16 -; SI-NEXT: s_or_b32 s17, s17, s19 -; SI-NEXT: s_add_i32 s98, s16, 3 -; SI-NEXT: v_readlane_b32 s19, v41, 17 -; SI-NEXT: s_add_i32 s21, s21, 3 -; SI-NEXT: s_and_b32 s16, s98, 0xffff -; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_add_i32 s11, s11, 3 -; SI-NEXT: s_add_i32 s9, s9, 3 -; SI-NEXT: s_and_b32 s21, s21, 0xffff -; SI-NEXT: s_or_b32 s16, s19, s16 -; SI-NEXT: v_readlane_b32 s19, v41, 16 ; SI-NEXT: s_add_i32 s13, s13, 3 +; SI-NEXT: s_and_b32 s13, s13, 0xffff +; SI-NEXT: s_lshl_b32 s12, s12, 16 +; SI-NEXT: s_add_i32 s11, s11, 3 +; SI-NEXT: v_readlane_b32 s6, v41, 30 +; SI-NEXT: s_add_i32 s15, s15, 3 +; SI-NEXT: s_or_b32 s12, s12, s13 ; SI-NEXT: s_and_b32 s11, s11, 0xffff ; SI-NEXT: s_lshl_b32 s10, s10, 16 +; SI-NEXT: s_lshl_b32 s13, s6, 16 +; SI-NEXT: v_readlane_b32 s6, v41, 29 +; SI-NEXT: s_and_b32 s15, s15, 0xffff +; SI-NEXT: s_lshl_b32 s14, s14, 16 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: s_lshl_b32 s11, s25, 16 +; SI-NEXT: s_add_i32 s25, s6, 3 +; SI-NEXT: v_readlane_b32 s6, v41, 28 +; SI-NEXT: s_or_b32 s14, s14, s15 +; SI-NEXT: s_lshl_b32 s15, s6, 16 +; SI-NEXT: v_readlane_b32 s6, v41, 27 +; SI-NEXT: s_add_i32 s23, s6, 3 +; SI-NEXT: v_readlane_b32 s6, v41, 26 +; SI-NEXT: s_add_i32 s9, s9, 3 +; SI-NEXT: s_lshl_b32 s20, s6, 16 +; SI-NEXT: v_readlane_b32 s6, v41, 25 +; SI-NEXT: s_lshl_b32 s5, s21, 16 ; SI-NEXT: s_and_b32 s9, s9, 0xffff ; SI-NEXT: s_lshl_b32 s8, s8, 16 -; SI-NEXT: s_add_i32 s29, s29, 3 -; SI-NEXT: s_or_b32 s20, s20, s21 -; SI-NEXT: s_add_i32 s96, s19, 3 -; SI-NEXT: v_readlane_b32 s21, v41, 15 -; SI-NEXT: s_add_i32 s15, s15, 3 -; SI-NEXT: s_and_b32 s13, s13, 0xffff -; SI-NEXT: s_lshl_b32 s12, s12, 16 -; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: s_add_i32 s7, s7, 3 +; SI-NEXT: s_add_i32 s21, s6, 3 +; SI-NEXT: v_readlane_b32 s6, v41, 23 ; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_lshl_b32 s9, s89, 16 +; SI-NEXT: s_add_i32 s29, s29, 3 +; SI-NEXT: s_lshl_b32 s19, s6, 16 +; SI-NEXT: v_readlane_b32 s6, v41, 22 +; SI-NEXT: s_or_b32 s7, s9, s7 ; SI-NEXT: s_and_b32 s9, s29, 0xffff -; SI-NEXT: s_lshl_b32 s11, s88, 16 ; SI-NEXT: s_add_i32 s27, s27, 3 -; SI-NEXT: s_and_b32 s19, s96, 0xffff -; SI-NEXT: s_lshl_b32 s21, s21, 16 -; SI-NEXT: s_and_b32 s15, s15, 0xffff -; SI-NEXT: s_lshl_b32 s14, s14, 16 -; SI-NEXT: s_or_b32 s12, s12, s13 +; SI-NEXT: s_add_i32 s16, s6, 3 +; SI-NEXT: v_readlane_b32 s6, v41, 21 ; SI-NEXT: s_or_b32 s9, s11, s9 ; SI-NEXT: s_and_b32 s11, s27, 0xffff -; SI-NEXT: s_lshl_b32 s13, s79, 16 -; SI-NEXT: s_add_i32 s25, s25, 3 -; SI-NEXT: s_or_b32 s19, s21, s19 -; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: v_readlane_b32 s21, v41, 14 -; SI-NEXT: s_or_b32 s14, s14, s15 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_lshl_b32 s17, s6, 16 +; SI-NEXT: v_readlane_b32 
s6, v41, 20 ; SI-NEXT: s_or_b32 s11, s13, s11 ; SI-NEXT: s_and_b32 s13, s25, 0xffff -; SI-NEXT: s_lshl_b32 s15, s78, 16 -; SI-NEXT: s_add_i32 s23, s23, 3 -; SI-NEXT: s_and_b32 s18, s18, 0xffff -; SI-NEXT: s_lshl_b32 s21, s21, 16 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_add_i32 s6, s6, 3 +; SI-NEXT: v_readlane_b32 s17, v41, 19 ; SI-NEXT: s_or_b32 s13, s15, s13 ; SI-NEXT: s_and_b32 s15, s23, 0xffff -; SI-NEXT: s_lshl_b32 s22, s77, 16 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_lshl_b32 s60, s18, 16 +; SI-NEXT: s_or_b32 s15, s20, s15 +; SI-NEXT: s_and_b32 s20, s21, 0xffff +; SI-NEXT: s_or_b32 s6, s17, s6 +; SI-NEXT: v_readlane_b32 s17, v41, 18 +; SI-NEXT: v_readlane_b32 s18, v41, 17 +; SI-NEXT: s_or_b32 s19, s19, s20 +; SI-NEXT: s_add_i32 s98, s17, 3 +; SI-NEXT: s_lshl_b32 s20, s18, 16 +; SI-NEXT: v_readlane_b32 s18, v41, 16 +; SI-NEXT: s_and_b32 s17, s98, 0xffff +; SI-NEXT: s_add_i32 s96, s18, 3 +; SI-NEXT: v_readlane_b32 s18, v41, 15 +; SI-NEXT: s_or_b32 s17, s20, s17 +; SI-NEXT: s_and_b32 s20, s96, 0xffff +; SI-NEXT: s_lshl_b32 s21, s18, 16 +; SI-NEXT: v_readlane_b32 s18, v41, 24 +; SI-NEXT: s_or_b32 s20, s21, s20 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: v_readlane_b32 s21, v41, 14 +; SI-NEXT: s_and_b32 s18, s18, 0xffff +; SI-NEXT: s_lshl_b32 s21, s21, 16 ; SI-NEXT: s_or_b32 s18, s21, s18 ; SI-NEXT: v_readlane_b32 s21, v41, 13 -; SI-NEXT: s_or_b32 s15, s22, s15 ; SI-NEXT: s_add_i32 s21, s21, 3 ; SI-NEXT: v_readlane_b32 s22, v41, 12 ; SI-NEXT: s_and_b32 s21, s21, 0xffff @@ -239023,42 +240378,20 @@ define inreg <64 x bfloat> @bitcast_v64i16_to_v64bf16_scalar(<64 x i16> inreg %a ; SI-NEXT: s_and_b32 s27, s27, 0xffff ; SI-NEXT: s_lshl_b32 s28, s28, 16 ; SI-NEXT: s_or_b32 s27, s28, s27 -; SI-NEXT: s_add_i32 s27, s27, 0x30000 -; SI-NEXT: s_add_i32 s26, s26, 0x30000 -; SI-NEXT: s_and_b32 s86, s27, 0xffff0000 -; SI-NEXT: s_lshl_b32 s27, s27, 16 -; SI-NEXT: s_add_i32 s25, s25, 0x30000 -; SI-NEXT: v_writelane_b32 v41, s27, 25 -; SI-NEXT: s_and_b32 s96, s26, 0xffff0000 -; SI-NEXT: s_lshl_b32 s26, s26, 16 -; SI-NEXT: s_add_i32 s24, s24, 0x30000 -; SI-NEXT: v_writelane_b32 v41, s26, 26 -; SI-NEXT: s_and_b32 s99, s25, 0xffff0000 -; SI-NEXT: s_lshl_b32 s25, s25, 16 -; SI-NEXT: s_add_i32 s23, s23, 0x30000 -; SI-NEXT: v_writelane_b32 v41, s25, 27 -; SI-NEXT: s_and_b32 s97, s24, 0xffff0000 -; SI-NEXT: s_lshl_b32 s24, s24, 16 ; SI-NEXT: s_add_i32 s80, s80, 3 -; SI-NEXT: s_add_i32 s22, s22, 0x30000 -; SI-NEXT: v_writelane_b32 v41, s24, 28 -; SI-NEXT: s_and_b32 s92, s23, 0xffff0000 -; SI-NEXT: s_lshl_b32 s23, s23, 16 +; SI-NEXT: s_add_i32 s27, s27, 0x30000 ; SI-NEXT: s_and_b32 s4, s80, 0xffff ; SI-NEXT: s_add_i32 s84, s84, 3 -; SI-NEXT: s_add_i32 s21, s21, 0x30000 -; SI-NEXT: v_writelane_b32 v41, s23, 29 -; SI-NEXT: s_and_b32 s94, s22, 0xffff0000 -; SI-NEXT: s_lshl_b32 s22, s22, 16 +; SI-NEXT: s_and_b32 s28, s27, 0xffff0000 ; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_and_b32 s5, s84, 0xffff ; SI-NEXT: s_add_i32 s83, s83, 3 -; SI-NEXT: s_add_i32 s18, s18, 0x30000 -; SI-NEXT: v_writelane_b32 v41, s22, 30 -; SI-NEXT: s_and_b32 s95, s21, 0xffff0000 -; SI-NEXT: s_lshl_b32 s21, s21, 16 +; SI-NEXT: s_add_i32 s26, s26, 0x30000 +; SI-NEXT: v_writelane_b32 v41, s28, 31 +; SI-NEXT: s_lshl_b32 s27, s27, 16 ; SI-NEXT: s_or_b32 s5, s60, s5 ; SI-NEXT: s_and_b32 s60, s83, 0xffff +; SI-NEXT: s_lshl_b32 s61, s79, 16 ; SI-NEXT: s_add_i32 s87, s87, 3 ; SI-NEXT: s_add_i32 s59, s59, 3 ; SI-NEXT: s_add_i32 s57, s57, 3 @@ -239066,13 +240399,11 @@ define inreg <64 x bfloat> 
@bitcast_v64i16_to_v64bf16_scalar(<64 x i16> inreg %a ; SI-NEXT: s_add_i32 s45, s45, 3 ; SI-NEXT: s_add_i32 s43, s43, 3 ; SI-NEXT: s_add_i32 s41, s41, 3 -; SI-NEXT: s_add_i32 s19, s19, 0x30000 -; SI-NEXT: v_writelane_b32 v41, s21, 31 -; SI-NEXT: s_and_b32 s93, s18, 0xffff0000 -; SI-NEXT: s_lshl_b32 s18, s18, 16 -; SI-NEXT: s_or_b32 s76, s61, s60 +; SI-NEXT: v_writelane_b32 v41, s27, 32 +; SI-NEXT: s_and_b32 s27, s26, 0xffff0000 +; SI-NEXT: s_or_b32 vcc_lo, s61, s60 ; SI-NEXT: s_and_b32 s60, s87, 0xffff -; SI-NEXT: s_lshl_b32 s61, s73, 16 +; SI-NEXT: s_lshl_b32 s61, s77, 16 ; SI-NEXT: s_and_b32 s59, s59, 0xffff ; SI-NEXT: s_lshl_b32 s58, s58, 16 ; SI-NEXT: s_and_b32 s57, s57, 0xffff @@ -239085,24 +240416,22 @@ define inreg <64 x bfloat> @bitcast_v64i16_to_v64bf16_scalar(<64 x i16> inreg %a ; SI-NEXT: s_lshl_b32 s42, s42, 16 ; SI-NEXT: s_and_b32 s41, s41, 0xffff ; SI-NEXT: s_lshl_b32 s40, s40, 16 -; SI-NEXT: s_add_i32 s16, s16, 0x30000 -; SI-NEXT: v_writelane_b32 v41, s18, 32 -; SI-NEXT: s_lshl_b32 s18, s19, 16 -; SI-NEXT: s_or_b32 s75, s61, s60 +; SI-NEXT: s_add_i32 s25, s25, 0x30000 +; SI-NEXT: v_writelane_b32 v41, s27, 33 +; SI-NEXT: s_lshl_b32 s26, s26, 16 +; SI-NEXT: s_or_b32 vcc_hi, s61, s60 ; SI-NEXT: s_or_b32 s58, s58, s59 ; SI-NEXT: s_or_b32 s56, s56, s57 ; SI-NEXT: s_or_b32 s46, s46, s47 ; SI-NEXT: s_or_b32 s44, s44, s45 ; SI-NEXT: s_or_b32 s42, s42, s43 ; SI-NEXT: s_or_b32 s40, s40, s41 -; SI-NEXT: s_add_i32 s6, s6, 0x30000 -; SI-NEXT: v_writelane_b32 v41, s18, 33 -; SI-NEXT: s_and_b32 s31, s16, 0xffff0000 -; SI-NEXT: s_lshl_b32 s16, s16, 16 +; SI-NEXT: v_writelane_b32 v41, s26, 34 +; SI-NEXT: s_and_b32 s26, s25, 0xffff0000 ; SI-NEXT: s_add_i32 s4, s4, 0x30000 ; SI-NEXT: s_add_i32 s5, s5, 0x30000 -; SI-NEXT: s_add_i32 s76, s76, 0x30000 -; SI-NEXT: s_add_i32 s75, s75, 0x30000 +; SI-NEXT: s_add_i32 vcc_lo, vcc_lo, 0x30000 +; SI-NEXT: s_add_i32 vcc_hi, vcc_hi, 0x30000 ; SI-NEXT: s_add_i32 s58, s58, 0x30000 ; SI-NEXT: s_add_i32 s56, s56, 0x30000 ; SI-NEXT: s_add_i32 s46, s46, 0x30000 @@ -239113,294 +240442,311 @@ define inreg <64 x bfloat> @bitcast_v64i16_to_v64bf16_scalar(<64 x i16> inreg %a ; SI-NEXT: s_add_i32 s12, s12, 0x30000 ; SI-NEXT: s_add_i32 s10, s10, 0x30000 ; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 ; SI-NEXT: s_add_i32 s9, s9, 0x30000 ; SI-NEXT: s_add_i32 s11, s11, 0x30000 ; SI-NEXT: s_add_i32 s13, s13, 0x30000 ; SI-NEXT: s_add_i32 s15, s15, 0x30000 -; SI-NEXT: s_add_i32 s20, s20, 0x30000 +; SI-NEXT: s_add_i32 s19, s19, 0x30000 +; SI-NEXT: s_add_i32 s16, s16, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 ; SI-NEXT: s_add_i32 s17, s17, 0x30000 -; SI-NEXT: s_add_i32 s7, s7, 0x30000 -; SI-NEXT: v_writelane_b32 v41, s16, 34 -; SI-NEXT: s_and_b32 s34, s6, 0xffff0000 -; SI-NEXT: s_lshl_b32 s6, s6, 16 -; SI-NEXT: s_and_b32 s30, s19, 0xffff0000 -; SI-NEXT: v_writelane_b32 v41, s6, 35 -; SI-NEXT: s_and_b32 s35, s7, 0xffff0000 -; SI-NEXT: s_lshl_b32 s6, s7, 16 -; SI-NEXT: s_and_b32 s36, s17, 0xffff0000 -; SI-NEXT: s_lshl_b32 s17, s17, 16 -; SI-NEXT: s_and_b32 s37, s20, 0xffff0000 -; SI-NEXT: s_lshl_b32 s22, s20, 16 -; SI-NEXT: s_and_b32 s38, s15, 0xffff0000 -; SI-NEXT: s_lshl_b32 s24, s15, 16 -; SI-NEXT: s_and_b32 s39, s13, 0xffff0000 -; SI-NEXT: s_lshl_b32 s28, s13, 16 -; SI-NEXT: s_and_b32 s48, s11, 0xffff0000 -; SI-NEXT: s_lshl_b32 s61, s11, 16 +; SI-NEXT: s_add_i32 s20, s20, 0x30000 +; SI-NEXT: s_add_i32 s18, s18, 0x30000 +; SI-NEXT: s_add_i32 s21, s21, 0x30000 +; SI-NEXT: s_add_i32 s22, s22, 0x30000 +; SI-NEXT: s_add_i32 s23, s23, 0x30000 +; 
SI-NEXT: s_add_i32 s24, s24, 0x30000 +; SI-NEXT: v_writelane_b32 v41, s26, 35 +; SI-NEXT: s_lshl_b32 s25, s25, 16 +; SI-NEXT: v_writelane_b32 v41, s25, 36 +; SI-NEXT: s_and_b32 s25, s24, 0xffff0000 +; SI-NEXT: s_lshl_b32 s55, s24, 16 +; SI-NEXT: s_and_b32 s73, s23, 0xffff0000 +; SI-NEXT: s_lshl_b32 s54, s23, 16 +; SI-NEXT: s_and_b32 s63, s22, 0xffff0000 +; SI-NEXT: s_lshl_b32 s98, s22, 16 +; SI-NEXT: s_and_b32 s62, s21, 0xffff0000 +; SI-NEXT: s_lshl_b32 s96, s21, 16 +; SI-NEXT: s_and_b32 s99, s18, 0xffff0000 +; SI-NEXT: s_lshl_b32 s97, s18, 16 +; SI-NEXT: s_and_b32 s86, s20, 0xffff0000 +; SI-NEXT: s_lshl_b32 s85, s20, 16 +; SI-NEXT: s_and_b32 s82, s17, 0xffff0000 +; SI-NEXT: s_lshl_b32 s81, s17, 16 +; SI-NEXT: s_and_b32 s71, s6, 0xffff0000 +; SI-NEXT: s_lshl_b32 s70, s6, 16 +; SI-NEXT: s_and_b32 s69, s16, 0xffff0000 +; SI-NEXT: s_lshl_b32 s68, s16, 16 +; SI-NEXT: s_and_b32 s67, s19, 0xffff0000 +; SI-NEXT: s_lshl_b32 s66, s19, 16 +; SI-NEXT: s_and_b32 s65, s15, 0xffff0000 +; SI-NEXT: s_lshl_b32 s64, s15, 16 +; SI-NEXT: s_and_b32 s53, s13, 0xffff0000 +; SI-NEXT: s_lshl_b32 s52, s13, 16 +; SI-NEXT: s_and_b32 s51, s11, 0xffff0000 +; SI-NEXT: s_lshl_b32 s50, s11, 16 ; SI-NEXT: s_and_b32 s49, s9, 0xffff0000 -; SI-NEXT: s_lshl_b32 s89, s9, 16 -; SI-NEXT: s_and_b32 s50, s8, 0xffff0000 -; SI-NEXT: s_lshl_b32 s60, s8, 16 -; SI-NEXT: s_and_b32 s91, s10, 0xffff0000 -; SI-NEXT: s_lshl_b32 s90, s10, 16 -; SI-NEXT: s_and_b32 s51, s12, 0xffff0000 -; SI-NEXT: s_lshl_b32 s70, s12, 16 -; SI-NEXT: s_and_b32 s52, s14, 0xffff0000 -; SI-NEXT: s_lshl_b32 s71, s14, 16 -; SI-NEXT: s_and_b32 s53, s40, 0xffff0000 -; SI-NEXT: s_lshl_b32 s20, s40, 16 -; SI-NEXT: s_and_b32 s54, s42, 0xffff0000 -; SI-NEXT: s_lshl_b32 s81, s42, 16 -; SI-NEXT: s_and_b32 s55, s44, 0xffff0000 -; SI-NEXT: s_lshl_b32 s63, s44, 16 -; SI-NEXT: s_and_b32 s64, s46, 0xffff0000 -; SI-NEXT: s_lshl_b32 s72, s46, 16 -; SI-NEXT: s_and_b32 s65, s56, 0xffff0000 -; SI-NEXT: s_lshl_b32 s82, s56, 16 -; SI-NEXT: s_and_b32 s66, s58, 0xffff0000 -; SI-NEXT: s_lshl_b32 s74, s58, 16 -; SI-NEXT: s_and_b32 s67, s75, 0xffff0000 -; SI-NEXT: s_lshl_b32 s75, s75, 16 -; SI-NEXT: s_and_b32 s68, s76, 0xffff0000 -; SI-NEXT: s_lshl_b32 s76, s76, 16 -; SI-NEXT: s_and_b32 s69, s5, 0xffff0000 -; SI-NEXT: s_lshl_b32 s85, s5, 16 -; SI-NEXT: s_and_b32 s26, s4, 0xffff0000 -; SI-NEXT: s_lshl_b32 s5, s4, 16 -; SI-NEXT: v_writelane_b32 v41, s6, 36 +; SI-NEXT: s_lshl_b32 s48, s9, 16 +; SI-NEXT: s_and_b32 s39, s7, 0xffff0000 +; SI-NEXT: s_lshl_b32 s38, s7, 16 +; SI-NEXT: s_and_b32 s37, s8, 0xffff0000 +; SI-NEXT: s_lshl_b32 s36, s8, 16 +; SI-NEXT: s_and_b32 s35, s10, 0xffff0000 +; SI-NEXT: s_lshl_b32 s31, s10, 16 +; SI-NEXT: s_and_b32 s34, s12, 0xffff0000 +; SI-NEXT: s_lshl_b32 s95, s12, 16 +; SI-NEXT: s_and_b32 s30, s14, 0xffff0000 +; SI-NEXT: s_lshl_b32 s93, s14, 16 +; SI-NEXT: s_and_b32 s94, s40, 0xffff0000 +; SI-NEXT: s_lshl_b32 s91, s40, 16 +; SI-NEXT: s_and_b32 s92, s42, 0xffff0000 +; SI-NEXT: s_lshl_b32 s88, s42, 16 +; SI-NEXT: s_and_b32 s90, s44, 0xffff0000 +; SI-NEXT: s_lshl_b32 s78, s44, 16 +; SI-NEXT: s_and_b32 s28, s46, 0xffff0000 +; SI-NEXT: s_lshl_b32 s76, s46, 16 +; SI-NEXT: s_and_b32 s26, s56, 0xffff0000 +; SI-NEXT: s_lshl_b32 s74, s56, 16 +; SI-NEXT: s_and_b32 s24, s58, 0xffff0000 +; SI-NEXT: s_lshl_b32 s72, s58, 16 +; SI-NEXT: s_and_b32 s22, vcc_hi, 0xffff0000 +; SI-NEXT: s_lshl_b32 s61, vcc_hi, 16 +; SI-NEXT: s_and_b32 s20, vcc_lo, 0xffff0000 +; SI-NEXT: s_lshl_b32 s60, vcc_lo, 16 +; SI-NEXT: s_and_b32 s19, s5, 0xffff0000 +; SI-NEXT: s_lshl_b32 s17, s5, 16 +; SI-NEXT: 
s_and_b32 s5, s4, 0xffff0000 +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: v_writelane_b32 v41, s25, 37 ; SI-NEXT: .LBB107_5: ; %end -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s86 -; SI-NEXT: v_readlane_b32 s4, v41, 25 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s4 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_readlane_b32 s6, v41, 31 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s6 +; SI-NEXT: v_readlane_b32 s6, v41, 32 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s6 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: v_readlane_b32 s6, v41, 33 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s96 -; SI-NEXT: v_readlane_b32 s4, v41, 26 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s4 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s6 +; SI-NEXT: v_readlane_b32 s6, v41, 34 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s6 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 +; SI-NEXT: v_readlane_b32 s6, v41, 35 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s99 -; SI-NEXT: v_readlane_b32 s4, v41, 27 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s4 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s6 +; SI-NEXT: v_readlane_b32 s6, v41, 36 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s6 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 +; SI-NEXT: v_readlane_b32 s6, v41, 37 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s97 -; SI-NEXT: v_readlane_b32 s4, v41, 28 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s4 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s6 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s55 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s92 -; SI-NEXT: v_readlane_b32 s4, v41, 29 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s4 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s73 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s54 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s94 -; SI-NEXT: v_readlane_b32 s4, v41, 30 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s4 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s63 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s98 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s95 -; SI-NEXT: v_readlane_b32 s4, v41, 31 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s4 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s62 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; 
SI-NEXT: v_mul_f32_e64 v1, 1.0, s96 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s93 -; SI-NEXT: v_readlane_b32 s4, v41, 32 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s4 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s99 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s97 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s30 -; SI-NEXT: v_readlane_b32 s4, v41, 33 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s4 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s86 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s85 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s31 -; SI-NEXT: v_readlane_b32 s4, v41, 34 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s4 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s82 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s81 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s34 -; SI-NEXT: v_readlane_b32 s4, v41, 35 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s4 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s71 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s70 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s35 -; SI-NEXT: v_readlane_b32 s4, v41, 36 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s4 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s69 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s68 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s36 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s17 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s67 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s66 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s37 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s22 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s65 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s64 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: 
v_mul_f32_e64 v1, 1.0, s38 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s24 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s53 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s52 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s39 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s28 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s51 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s50 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s49 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s48 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s61 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s49 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s89 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s39 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s38 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s50 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s60 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s37 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s36 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s91 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s90 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s35 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s31 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s51 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s70 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s34 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s95 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s52 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s71 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s30 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s93 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: 
v_mul_f32_e64 v1, 1.0, s53 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s20 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s94 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s91 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s54 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s81 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s92 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s88 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s55 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s63 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s90 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s78 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s64 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s72 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s28 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s76 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s65 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s82 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s26 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s74 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s66 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s74 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s24 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s72 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s67 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s75 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s22 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s61 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s68 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s76 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s20 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s60 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 
offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s69 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s85 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s19 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s17 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s26 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s5 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s5 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s4 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: v_readlane_b32 s99, v40, 35 diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll index c6211aae19c1b..078ba76eb1f12 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll @@ -2853,50 +2853,52 @@ define inreg <4 x i32> @bitcast_v8bf16_to_v4i32_scalar(<8 x bfloat> inreg %a, i3 ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s24, 0 -; SI-NEXT: v_mul_f32_e64 v10, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v11, 1.0, s16 -; SI-NEXT: v_mul_f32_e64 v8, 1.0, s19 -; SI-NEXT: v_mul_f32_e64 v9, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v6, 1.0, s21 -; SI-NEXT: v_mul_f32_e64 v7, 1.0, s20 -; SI-NEXT: v_mul_f32_e64 v4, 1.0, s23 -; SI-NEXT: v_mul_f32_e64 v5, 1.0, s22 +; SI-NEXT: v_mul_f32_e64 v15, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v10, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v14, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v8, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v13, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v6, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v12, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v4, 1.0, s22 ; SI-NEXT: s_cbranch_scc0 .LBB23_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_alignbit_b32 v0, v0, v11, 16 -; SI-NEXT: v_alignbit_b32 v1, v1, v9, 16 -; SI-NEXT: v_alignbit_b32 v2, v2, v7, 16 -; SI-NEXT: v_alignbit_b32 v3, v3, v5, 16 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v15 +; SI-NEXT: v_lshr_b64 v[0:1], v[10:11], 16 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v12 +; SI-NEXT: v_lshr_b64 v[1:2], v[8:9], 16 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v13 +; SI-NEXT: v_lshr_b64 v[16:17], v[4:5], 16 +; SI-NEXT: v_lshr_b64 v[2:3], v[6:7], 16 +; SI-NEXT: v_mov_b32_e32 v3, v16 ; SI-NEXT: s_cbranch_execnz .LBB23_3 ; SI-NEXT: .LBB23_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v10 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v11 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v15 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v10 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v8 -; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v9 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v14 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v8 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, 
v1 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 -; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v7 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v12 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v13 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 -; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16 +; SI-NEXT: v_lshr_b64 v[4:5], v[4:5], 16 +; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 16 +; SI-NEXT: v_mov_b32_e32 v3, v4 ; SI-NEXT: .LBB23_3: ; %end ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB23_4: @@ -2912,78 +2914,80 @@ define inreg <4 x i32> @bitcast_v8bf16_to_v4i32_scalar(<8 x bfloat> inreg %a, i3 ; VI-NEXT: s_cbranch_execnz .LBB23_4 ; VI-NEXT: .LBB23_2: ; %cmp.true ; VI-NEXT: s_lshl_b32 s4, s19, 16 -; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_mov_b32_e32 v7, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v0, s4, v7 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; VI-NEXT: v_add_f32_e32 v1, s4, v7 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; VI-NEXT: s_lshl_b32 s4, s18, 16 -; VI-NEXT: v_alignbit_b32 v3, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_lshrrev_b64 v[4:5], 16, v[0:1] +; VI-NEXT: v_add_f32_e32 v0, s4, v7 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; VI-NEXT: v_add_f32_e32 v1, s4, v7 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v4, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v2 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: 
v_or_b32_e32 v5, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; VI-NEXT: s_lshl_b32 s4, s17, 16 -; VI-NEXT: v_alignbit_b32 v2, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v4, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v1 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: v_lshrrev_b64 v[2:3], 16, v[0:1] +; VI-NEXT: v_add_f32_e32 v0, s4, v7 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc +; VI-NEXT: v_add_f32_e32 v1, s4, v7 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc -; VI-NEXT: v_add_f32_e32 v4, s4, v0 -; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; VI-NEXT: s_lshl_b32 s4, s16, 16 -; VI-NEXT: v_alignbit_b32 v1, v4, v1, 16 -; VI-NEXT: v_add_f32_e32 v4, s4, v0 -; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; VI-NEXT: v_add_f32_e32 v0, s4, v0 -; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; VI-NEXT: v_bfe_u32 v5, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v0 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_or_b32_e32 v6, 0x400000, v0 +; VI-NEXT: v_lshrrev_b64 v[5:6], 16, v[0:1] +; VI-NEXT: v_add_f32_e32 v0, s4, v7 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v5, v6, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v0, v4, 16 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc +; VI-NEXT: v_add_f32_e32 v1, s4, v7 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v6, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, v5 +; VI-NEXT: v_mov_b32_e32 v3, v4 ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB23_3: ; VI-NEXT: s_branch .LBB23_2 @@ -7392,50 +7396,52 @@ define inreg <4 x float> @bitcast_v8bf16_to_v4f32_scalar(<8 x bfloat> inreg %a, ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s24, 0 -; SI-NEXT: v_mul_f32_e64 v10, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v11, 1.0, s16 -; SI-NEXT: v_mul_f32_e64 v8, 
1.0, s19 -; SI-NEXT: v_mul_f32_e64 v9, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v6, 1.0, s21 -; SI-NEXT: v_mul_f32_e64 v7, 1.0, s20 -; SI-NEXT: v_mul_f32_e64 v4, 1.0, s23 -; SI-NEXT: v_mul_f32_e64 v5, 1.0, s22 +; SI-NEXT: v_mul_f32_e64 v15, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v10, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v14, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v8, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v13, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v6, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v12, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v4, 1.0, s22 ; SI-NEXT: s_cbranch_scc0 .LBB47_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_alignbit_b32 v0, v0, v11, 16 -; SI-NEXT: v_alignbit_b32 v1, v1, v9, 16 -; SI-NEXT: v_alignbit_b32 v2, v2, v7, 16 -; SI-NEXT: v_alignbit_b32 v3, v3, v5, 16 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v15 +; SI-NEXT: v_lshr_b64 v[0:1], v[10:11], 16 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v12 +; SI-NEXT: v_lshr_b64 v[1:2], v[8:9], 16 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v13 +; SI-NEXT: v_lshr_b64 v[16:17], v[4:5], 16 +; SI-NEXT: v_lshr_b64 v[2:3], v[6:7], 16 +; SI-NEXT: v_mov_b32_e32 v3, v16 ; SI-NEXT: s_cbranch_execnz .LBB47_3 ; SI-NEXT: .LBB47_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v10 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v11 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v15 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v10 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v8 -; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v9 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v14 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v8 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 -; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v7 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v12 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v13 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 -; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16 +; SI-NEXT: v_lshr_b64 v[4:5], v[4:5], 16 +; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 16 +; SI-NEXT: v_mov_b32_e32 v3, v4 ; SI-NEXT: .LBB47_3: ; %end ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB47_4: @@ -7451,78 +7457,80 @@ define inreg <4 x float> @bitcast_v8bf16_to_v4f32_scalar(<8 x bfloat> inreg %a, ; VI-NEXT: s_cbranch_execnz .LBB47_4 ; VI-NEXT: .LBB47_2: ; %cmp.true ; VI-NEXT: s_lshl_b32 s4, s19, 16 -; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_mov_b32_e32 v7, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v0, s4, v7 
+; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; VI-NEXT: v_add_f32_e32 v1, s4, v7 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; VI-NEXT: s_lshl_b32 s4, s18, 16 -; VI-NEXT: v_alignbit_b32 v3, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_lshrrev_b64 v[4:5], 16, v[0:1] +; VI-NEXT: v_add_f32_e32 v0, s4, v7 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; VI-NEXT: v_add_f32_e32 v1, s4, v7 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v4, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v2 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; VI-NEXT: s_lshl_b32 s4, s17, 16 -; VI-NEXT: v_alignbit_b32 v2, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v4, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v1 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: v_lshrrev_b64 v[2:3], 16, v[0:1] +; VI-NEXT: v_add_f32_e32 v0, s4, v7 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc +; VI-NEXT: v_add_f32_e32 v1, s4, v7 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc -; VI-NEXT: v_add_f32_e32 v4, s4, v0 -; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; VI-NEXT: 
v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; VI-NEXT: s_lshl_b32 s4, s16, 16 -; VI-NEXT: v_alignbit_b32 v1, v4, v1, 16 -; VI-NEXT: v_add_f32_e32 v4, s4, v0 -; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; VI-NEXT: v_add_f32_e32 v0, s4, v0 -; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; VI-NEXT: v_bfe_u32 v5, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v0 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_or_b32_e32 v6, 0x400000, v0 +; VI-NEXT: v_lshrrev_b64 v[5:6], 16, v[0:1] +; VI-NEXT: v_add_f32_e32 v0, s4, v7 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v5, v6, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v0, v4, 16 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc +; VI-NEXT: v_add_f32_e32 v1, s4, v7 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v6, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, v5 +; VI-NEXT: v_mov_b32_e32 v3, v4 ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB47_3: ; VI-NEXT: s_branch .LBB47_2 @@ -11581,50 +11589,52 @@ define inreg <2 x i64> @bitcast_v8bf16_to_v2i64_scalar(<8 x bfloat> inreg %a, i3 ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s24, 0 -; SI-NEXT: v_mul_f32_e64 v10, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v11, 1.0, s16 -; SI-NEXT: v_mul_f32_e64 v8, 1.0, s19 -; SI-NEXT: v_mul_f32_e64 v9, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v6, 1.0, s21 -; SI-NEXT: v_mul_f32_e64 v7, 1.0, s20 -; SI-NEXT: v_mul_f32_e64 v4, 1.0, s23 -; SI-NEXT: v_mul_f32_e64 v5, 1.0, s22 +; SI-NEXT: v_mul_f32_e64 v15, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v10, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v14, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v8, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v13, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v6, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v12, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v4, 1.0, s22 ; SI-NEXT: s_cbranch_scc0 .LBB67_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_alignbit_b32 v0, v0, v11, 16 -; SI-NEXT: v_alignbit_b32 v1, v1, v9, 16 -; SI-NEXT: v_alignbit_b32 v2, v2, v7, 16 -; SI-NEXT: v_alignbit_b32 v3, v3, v5, 16 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v15 +; SI-NEXT: v_lshr_b64 v[0:1], v[10:11], 16 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v12 +; SI-NEXT: v_lshr_b64 v[1:2], v[8:9], 16 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v13 +; SI-NEXT: v_lshr_b64 v[16:17], v[4:5], 16 +; SI-NEXT: v_lshr_b64 v[2:3], v[6:7], 16 +; SI-NEXT: v_mov_b32_e32 v3, v16 ; SI-NEXT: s_cbranch_execnz .LBB67_3 ; SI-NEXT: .LBB67_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v10 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v11 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v15 +; SI-NEXT: 
v_and_b32_e32 v0, 0xffff0000, v10 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v8 -; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v9 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v14 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v8 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 -; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v7 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v12 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v13 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 -; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16 +; SI-NEXT: v_lshr_b64 v[4:5], v[4:5], 16 +; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 16 +; SI-NEXT: v_mov_b32_e32 v3, v4 ; SI-NEXT: .LBB67_3: ; %end ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB67_4: @@ -11640,78 +11650,80 @@ define inreg <2 x i64> @bitcast_v8bf16_to_v2i64_scalar(<8 x bfloat> inreg %a, i3 ; VI-NEXT: s_cbranch_execnz .LBB67_4 ; VI-NEXT: .LBB67_2: ; %cmp.true ; VI-NEXT: s_lshl_b32 s4, s19, 16 -; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_mov_b32_e32 v7, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v0, s4, v7 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; VI-NEXT: v_add_f32_e32 v1, s4, v7 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; VI-NEXT: s_lshl_b32 s4, s18, 16 -; VI-NEXT: v_alignbit_b32 v3, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_lshrrev_b64 v[4:5], 16, v[0:1] +; VI-NEXT: v_add_f32_e32 v0, s4, v7 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; VI-NEXT: v_add_f32_e32 v1, s4, 
v7 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v4, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v2 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; VI-NEXT: s_lshl_b32 s4, s17, 16 -; VI-NEXT: v_alignbit_b32 v2, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v4, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v1 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: v_lshrrev_b64 v[2:3], 16, v[0:1] +; VI-NEXT: v_add_f32_e32 v0, s4, v7 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc +; VI-NEXT: v_add_f32_e32 v1, s4, v7 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc -; VI-NEXT: v_add_f32_e32 v4, s4, v0 -; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; VI-NEXT: s_lshl_b32 s4, s16, 16 -; VI-NEXT: v_alignbit_b32 v1, v4, v1, 16 -; VI-NEXT: v_add_f32_e32 v4, s4, v0 -; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; VI-NEXT: v_add_f32_e32 v0, s4, v0 -; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; VI-NEXT: v_bfe_u32 v5, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v0 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_or_b32_e32 v6, 0x400000, v0 +; VI-NEXT: v_lshrrev_b64 v[5:6], 16, v[0:1] +; VI-NEXT: v_add_f32_e32 v0, s4, v7 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v5, v6, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v0, v4, 16 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc +; VI-NEXT: v_add_f32_e32 v1, s4, v7 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v6, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 
+; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, v5 +; VI-NEXT: v_mov_b32_e32 v3, v4 ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB67_3: ; VI-NEXT: s_branch .LBB67_2 @@ -15349,50 +15361,52 @@ define inreg <2 x double> @bitcast_v8bf16_to_v2f64_scalar(<8 x bfloat> inreg %a, ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s24, 0 -; SI-NEXT: v_mul_f32_e64 v10, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v11, 1.0, s16 -; SI-NEXT: v_mul_f32_e64 v8, 1.0, s19 -; SI-NEXT: v_mul_f32_e64 v9, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v6, 1.0, s21 -; SI-NEXT: v_mul_f32_e64 v7, 1.0, s20 -; SI-NEXT: v_mul_f32_e64 v4, 1.0, s23 -; SI-NEXT: v_mul_f32_e64 v5, 1.0, s22 +; SI-NEXT: v_mul_f32_e64 v15, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v10, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v14, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v8, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v13, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v6, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v12, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v4, 1.0, s22 ; SI-NEXT: s_cbranch_scc0 .LBB83_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_alignbit_b32 v0, v0, v11, 16 -; SI-NEXT: v_alignbit_b32 v1, v1, v9, 16 -; SI-NEXT: v_alignbit_b32 v2, v2, v7, 16 -; SI-NEXT: v_alignbit_b32 v3, v3, v5, 16 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v15 +; SI-NEXT: v_lshr_b64 v[0:1], v[10:11], 16 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v12 +; SI-NEXT: v_lshr_b64 v[1:2], v[8:9], 16 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v13 +; SI-NEXT: v_lshr_b64 v[16:17], v[4:5], 16 +; SI-NEXT: v_lshr_b64 v[2:3], v[6:7], 16 +; SI-NEXT: v_mov_b32_e32 v3, v16 ; SI-NEXT: s_cbranch_execnz .LBB83_3 ; SI-NEXT: .LBB83_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v10 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v11 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v15 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v10 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v8 -; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v9 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v14 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v8 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 -; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v7 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v12 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v13 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 -; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16 +; SI-NEXT: v_lshr_b64 v[4:5], v[4:5], 16 +; SI-NEXT: 
v_lshr_b64 v[2:3], v[2:3], 16 +; SI-NEXT: v_mov_b32_e32 v3, v4 ; SI-NEXT: .LBB83_3: ; %end ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB83_4: @@ -15408,78 +15422,80 @@ define inreg <2 x double> @bitcast_v8bf16_to_v2f64_scalar(<8 x bfloat> inreg %a, ; VI-NEXT: s_cbranch_execnz .LBB83_4 ; VI-NEXT: .LBB83_2: ; %cmp.true ; VI-NEXT: s_lshl_b32 s4, s19, 16 -; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_mov_b32_e32 v7, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v0, s4, v7 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; VI-NEXT: v_add_f32_e32 v1, s4, v7 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; VI-NEXT: s_lshl_b32 s4, s18, 16 -; VI-NEXT: v_alignbit_b32 v3, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_lshrrev_b64 v[4:5], 16, v[0:1] +; VI-NEXT: v_add_f32_e32 v0, s4, v7 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; VI-NEXT: v_add_f32_e32 v1, s4, v7 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v4, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v2 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; VI-NEXT: s_lshl_b32 s4, s17, 16 -; VI-NEXT: v_alignbit_b32 v2, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v4, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v1 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: v_lshrrev_b64 v[2:3], 16, v[0:1] +; VI-NEXT: v_add_f32_e32 v0, s4, v7 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc +; VI-NEXT: v_add_f32_e32 v1, s4, v7 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; 
; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s17, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc
-; VI-NEXT: v_add_f32_e32 v4, s4, v0
-; VI-NEXT: v_bfe_u32 v5, v4, 16, 1
-; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4
-; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
-; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; VI-NEXT: s_lshl_b32 s4, s16, 16
-; VI-NEXT: v_alignbit_b32 v1, v4, v1, 16
-; VI-NEXT: v_add_f32_e32 v4, s4, v0
-; VI-NEXT: v_bfe_u32 v5, v4, 16, 1
-; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4
-; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
-; VI-NEXT: s_and_b32 s4, s16, 0xffff0000
-; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; VI-NEXT: v_add_f32_e32 v0, s4, v0
-; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
-; VI-NEXT: v_bfe_u32 v5, v0, 16, 1
-; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v0
-; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
-; VI-NEXT: v_or_b32_e32 v6, 0x400000, v0
+; VI-NEXT: v_lshrrev_b64 v[5:6], 16, v[0:1]
+; VI-NEXT: v_add_f32_e32 v0, s4, v7
+; VI-NEXT: v_bfe_u32 v1, v0, 16, 1
+; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0
+; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
+; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0
; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; VI-NEXT: v_cndmask_b32_e32 v0, v5, v6, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; VI-NEXT: v_alignbit_b32 v0, v0, v4, 16
+; VI-NEXT: s_and_b32 s4, s16, 0xffff0000
+; VI-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc
+; VI-NEXT: v_add_f32_e32 v1, s4, v7
+; VI-NEXT: v_bfe_u32 v3, v1, 16, 1
+; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1
+; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
+; VI-NEXT: v_or_b32_e32 v6, 0x400000, v1
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; VI-NEXT: v_cndmask_b32_e32 v1, v3, v6, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1]
+; VI-NEXT: v_mov_b32_e32 v1, v5
+; VI-NEXT: v_mov_b32_e32 v3, v4
; VI-NEXT: s_setpc_b64 s[30:31]
; VI-NEXT: .LBB83_3:
; VI-NEXT: s_branch .LBB83_2
@@ -18720,64 +18736,66 @@ define inreg <8 x i16> @bitcast_v8bf16_to_v8i16_scalar(<8 x bfloat> inreg %a, i3
; SI-NEXT: s_cmp_lg_u32 s24, 0
; SI-NEXT: v_mul_f32_e64 v15, 1.0, s16
; SI-NEXT: v_mul_f32_e64 v14, 1.0, s17
-; SI-NEXT: v_mul_f32_e64 v9, 1.0, s18
-; SI-NEXT: v_mul_f32_e64 v1, 1.0, s19
+; SI-NEXT: v_mul_f32_e64 v2, 1.0, s18
+; SI-NEXT: v_mul_f32_e64 v10, 1.0, s19
; SI-NEXT: v_mul_f32_e64 v13, 1.0, s20
; SI-NEXT: v_mul_f32_e64 v12, 1.0, s21
-; SI-NEXT: v_mul_f32_e64 v11, 1.0, s22
-; SI-NEXT: v_mul_f32_e64 v5, 1.0, s23
+; SI-NEXT: v_mul_f32_e64 v6, 1.0, s22
+; SI-NEXT: v_mul_f32_e64 v11, 1.0, s23
; SI-NEXT: s_cbranch_scc0 .LBB95_4
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v15
-; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v14
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v9
-; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v14
+; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v2
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v10
; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v13
-; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v12
-; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v11
-; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v5
+; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v12
+; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v6
+; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v11
; SI-NEXT: s_cbranch_execnz .LBB95_3
; SI-NEXT: .LBB95_2: ; %cmp.true
-; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v14
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v14
; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v15
-; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v2
+; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v1
; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v4
-; SI-NEXT: v_alignbit_b32 v0, v2, v0, 16
-; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v13
-; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v2
-; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v12
-; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v14
+; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v13
+; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v1
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v12
+; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v1
+; SI-NEXT: v_lshr_b64 v[4:5], v[3:4], 16
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v6
+; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v3
; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v11
-; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v2
+; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
-; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v5
-; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v9
+; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v1
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v10
+; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v3
; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT: v_alignbit_b32 v6, v7, v3, 16
-; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
+; SI-NEXT: v_lshr_b64 v[8:9], v[6:7], 16
; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1
-; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
-; SI-NEXT: v_lshr_b64 v[10:11], v[1:2], 16
-; SI-NEXT: v_lshr_b64 v[8:9], v[5:6], 16
-; SI-NEXT: v_alignbit_b32 v4, v12, v13, 16
+; SI-NEXT: v_lshr_b64 v[9:10], v[2:3], 16
+; SI-NEXT: v_mov_b32_e32 v6, v8
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v14
+; SI-NEXT: v_mov_b32_e32 v2, v9
+; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
+; SI-NEXT: v_lshr_b64 v[5:6], v[5:6], 16
; SI-NEXT: .LBB95_3: ; %end
-; SI-NEXT: v_mov_b32_e32 v1, v10
-; SI-NEXT: v_mov_b32_e32 v5, v8
+; SI-NEXT: v_mov_b32_e32 v2, v9
+; SI-NEXT: v_mov_b32_e32 v6, v8
; SI-NEXT: s_setpc_b64 s[30:31]
; SI-NEXT: .LBB95_4:
; SI-NEXT: ; implicit-def: $vgpr0
-; SI-NEXT: ; implicit-def: $vgpr10
-; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr1
+; SI-NEXT: ; implicit-def: $vgpr9
; SI-NEXT: ; implicit-def: $vgpr3
; SI-NEXT: ; implicit-def: $vgpr4
+; SI-NEXT: ; implicit-def: $vgpr5
; SI-NEXT: ; implicit-def: $vgpr8
-; SI-NEXT: ; implicit-def: $vgpr6
; SI-NEXT: ; implicit-def: $vgpr7
; SI-NEXT: s_branch .LBB95_2
;
; VI-LABEL: bitcast_v8bf16_to_v8i16_scalar:
@@ -18790,78 +18808,80 @@ define inreg <8 x i16> @bitcast_v8bf16_to_v8i16_scalar(<8 x bfloat> inreg %a, i3
; VI-NEXT: s_cbranch_execnz .LBB95_4
; VI-NEXT: .LBB95_2: ; %cmp.true
; VI-NEXT: s_lshl_b32 s4, s16, 16
-; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
-; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
-; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
+; VI-NEXT: v_mov_b32_e32 v7, 0x40c00000
+; VI-NEXT: v_add_f32_e32 v0, s4, v7
+; VI-NEXT: v_bfe_u32 v1, v0, 16, 1
+; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0
+; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
+; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; VI-NEXT: s_and_b32 s4, s16, 0xffff0000
-; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
-; VI-NEXT: v_cndmask_b32_e32 v4, v2, v3, vcc
+; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
+; VI-NEXT: v_add_f32_e32 v1, s4, v7
; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
; VI-NEXT: s_lshl_b32 s4, s17, 16
-; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v1
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
-; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
-; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s17, 0xffff0000
; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; VI-NEXT: v_add_f32_e32 v2, s4, v0
+; VI-NEXT: v_add_f32_e32 v2, s4, v7
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT: v_or_b32_e32 v6, 0x400000, v2
+; VI-NEXT: s_and_b32 s4, s17, 0xffff0000
+; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v6, vcc
+; VI-NEXT: v_add_f32_e32 v2, s4, v7
+; VI-NEXT: v_cndmask_b32_e32 v4, v3, v4, vcc
+; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
+; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
+; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
+; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: v_cndmask_b32_e32 v2, v3, v5, vcc
; VI-NEXT: s_lshl_b32 s4, s18, 16
-; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v2
-; VI-NEXT: v_add_f32_e32 v2, s4, v0
+; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v2
+; VI-NEXT: v_add_f32_e32 v2, s4, v7
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT: v_or_b32_e32 v7, 0x400000, v2
+; VI-NEXT: v_or_b32_e32 v6, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; VI-NEXT: s_and_b32 s4, s18, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v7, vcc
-; VI-NEXT: v_add_f32_e32 v3, s4, v0
-; VI-NEXT: v_bfe_u32 v7, v3, 16, 1
-; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v3
-; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
+; VI-NEXT: v_cndmask_b32_e32 v2, v3, v6, vcc
+; VI-NEXT: v_add_f32_e32 v3, s4, v7
+; VI-NEXT: v_bfe_u32 v6, v3, 16, 1
+; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v3
+; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
; VI-NEXT: v_or_b32_e32 v8, 0x400000, v3
; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; VI-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc
; VI-NEXT: s_lshl_b32 s4, s19, 16
-; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v3
-; VI-NEXT: v_add_f32_e32 v3, s4, v0
-; VI-NEXT: v_bfe_u32 v8, v3, 16, 1
-; VI-NEXT: v_add_u32_e32 v8, vcc, v8, v3
+; VI-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc
+; VI-NEXT: v_add_f32_e32 v6, s4, v7
+; VI-NEXT: v_bfe_u32 v8, v6, 16, 1
+; VI-NEXT: v_add_u32_e32 v8, vcc, v8, v6
; VI-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
; VI-NEXT: s_and_b32 s4, s19, 0xffff0000
-; VI-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; VI-NEXT: v_add_f32_e32 v0, s4, v0
-; VI-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc
-; VI-NEXT: v_bfe_u32 v8, v0, 16, 1
-; VI-NEXT: v_add_u32_e32 v8, vcc, v8, v0
+; VI-NEXT: v_or_b32_e32 v9, 0x400000, v6
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; VI-NEXT: v_add_f32_e32 v7, s4, v7
+; VI-NEXT: v_cndmask_b32_e32 v6, v8, v9, vcc
+; VI-NEXT: v_bfe_u32 v8, v7, 16, 1
+; VI-NEXT: v_add_u32_e32 v8, vcc, v8, v7
; VI-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
-; VI-NEXT: v_or_b32_e32 v9, 0x400000, v0
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; VI-NEXT: v_cndmask_b32_e32 v0, v8, v9, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; VI-NEXT: v_alignbit_b32 v3, v0, v3, 16
-; VI-NEXT: v_alignbit_b32 v2, v7, v2, 16
-; VI-NEXT: v_alignbit_b32 v1, v6, v1, 16
-; VI-NEXT: v_alignbit_b32 v0, v5, v4, 16
+; VI-NEXT: v_or_b32_e32 v9, 0x400000, v7
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
+; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; VI-NEXT: v_cndmask_b32_e32 v7, v8, v9, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; VI-NEXT: v_lshrrev_b64 v[2:3], 16, v[2:3]
+; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; VI-NEXT: v_lshrrev_b64 v[6:7], 16, v[6:7]
+; VI-NEXT: v_lshrrev_b64 v[3:4], 16, v[4:5]
+; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1]
+; VI-NEXT: v_mov_b32_e32 v1, v3
+; VI-NEXT: v_mov_b32_e32 v3, v6
; VI-NEXT: s_setpc_b64 s[30:31]
; VI-NEXT: .LBB95_3:
; VI-NEXT: s_branch .LBB95_2
@@ -21770,78 +21790,80 @@ define inreg <8 x half> @bitcast_v8bf16_to_v8f16_scalar(<8 x bfloat> inreg %a, i
; VI-NEXT: s_cbranch_execnz .LBB103_4
; VI-NEXT: .LBB103_2: ; %cmp.true
; VI-NEXT: s_lshl_b32 s4, s16, 16
-; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
-; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
-; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
+; VI-NEXT: v_mov_b32_e32 v7, 0x40c00000
+; VI-NEXT: v_add_f32_e32 v0, s4, v7
+; VI-NEXT: v_bfe_u32 v1, v0, 16, 1
+; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0
+; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
+; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; VI-NEXT: s_and_b32 s4, s16, 0xffff0000
-; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
-; VI-NEXT: v_cndmask_b32_e32 v4, v2, v3, vcc
+; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
+; VI-NEXT: v_add_f32_e32 v1, s4, v7
; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
; VI-NEXT: s_lshl_b32 s4, s17, 16
-; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v1
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
-; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
-; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s17, 0xffff0000
; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; VI-NEXT: v_add_f32_e32 v2, s4, v0
+; VI-NEXT: v_add_f32_e32 v2, s4, v7
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT: v_or_b32_e32 v6, 0x400000, v2
+; VI-NEXT: s_and_b32 s4, s17, 0xffff0000
+; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v6, vcc
+; VI-NEXT: v_add_f32_e32 v2, s4, v7
+; VI-NEXT: v_cndmask_b32_e32 v4, v3, v4, vcc
+; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
+; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
+; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
+; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: v_cndmask_b32_e32 v2, v3, v5, vcc
; VI-NEXT: s_lshl_b32 s4, s18, 16
-; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v2
-; VI-NEXT: v_add_f32_e32 v2, s4, v0
+; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v2
+; VI-NEXT: v_add_f32_e32 v2, s4, v7
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT: v_or_b32_e32 v7, 0x400000, v2
+; VI-NEXT: v_or_b32_e32 v6, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; VI-NEXT: s_and_b32 s4, s18, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v7, vcc
-; VI-NEXT: v_add_f32_e32 v3, s4, v0
-; VI-NEXT: v_bfe_u32 v7, v3, 16, 1
-; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v3
-; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
+; VI-NEXT: v_cndmask_b32_e32 v2, v3, v6, vcc
+; VI-NEXT: v_add_f32_e32 v3, s4, v7
+; VI-NEXT: v_bfe_u32 v6, v3, 16, 1
+; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v3
+; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
; VI-NEXT: v_or_b32_e32 v8, 0x400000, v3
; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; VI-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc
; VI-NEXT: s_lshl_b32 s4, s19, 16
-; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v3
-; VI-NEXT: v_add_f32_e32 v3, s4, v0
-; VI-NEXT: v_bfe_u32 v8, v3, 16, 1
-; VI-NEXT: v_add_u32_e32 v8, vcc, v8, v3
+; VI-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc
+; VI-NEXT: v_add_f32_e32 v6, s4, v7
+; VI-NEXT: v_bfe_u32 v8, v6, 16, 1
+; VI-NEXT: v_add_u32_e32 v8, vcc, v8, v6
; VI-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
; VI-NEXT: s_and_b32 s4, s19, 0xffff0000
-; VI-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; VI-NEXT: v_add_f32_e32 v0, s4, v0
-; VI-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc
-; VI-NEXT: v_bfe_u32 v8, v0, 16, 1
-; VI-NEXT: v_add_u32_e32 v8, vcc, v8, v0
+; VI-NEXT: v_or_b32_e32 v9, 0x400000, v6
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; VI-NEXT: v_add_f32_e32 v7, s4, v7
+; VI-NEXT: v_cndmask_b32_e32 v6, v8, v9, vcc
+; VI-NEXT: v_bfe_u32 v8, v7, 16, 1
+; VI-NEXT: v_add_u32_e32 v8, vcc, v8, v7
; VI-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
-; VI-NEXT: v_or_b32_e32 v9, 0x400000, v0
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; VI-NEXT: v_cndmask_b32_e32 v0, v8, v9, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; VI-NEXT: v_alignbit_b32 v3, v0, v3, 16
-; VI-NEXT: v_alignbit_b32 v2, v7, v2, 16
-; VI-NEXT: v_alignbit_b32 v1, v6, v1, 16
-; VI-NEXT: v_alignbit_b32 v0, v5, v4, 16
+; VI-NEXT: v_or_b32_e32 v9, 0x400000, v7
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
+; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; VI-NEXT: v_cndmask_b32_e32 v7, v8, v9, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; VI-NEXT: v_lshrrev_b64 v[2:3], 16, v[2:3]
+; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; VI-NEXT: v_lshrrev_b64 v[6:7], 16, v[6:7]
+; VI-NEXT: v_lshrrev_b64 v[3:4], 16, v[4:5]
+; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1]
+; VI-NEXT: v_mov_b32_e32 v1, v3
+; VI-NEXT: v_mov_b32_e32 v3, v6
; VI-NEXT: s_setpc_b64 s[30:31]
; VI-NEXT: .LBB103_3:
; VI-NEXT: s_branch .LBB103_2
@@ -24532,94 +24554,97 @@ define inreg <16 x i8> @bitcast_v8bf16_to_v16i8_scalar(<8 x bfloat> inreg %a, i3
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: s_cmp_lg_u32 s24, 0
-; SI-NEXT: v_mul_f32_e64 v9, 1.0, s17
-; SI-NEXT: v_mul_f32_e64 v16, 1.0, s16
-; SI-NEXT: v_mul_f32_e64 v0, 1.0, s19
-; SI-NEXT: v_mul_f32_e64 v8, 1.0, s18
-; SI-NEXT: v_mul_f32_e64 v25, 1.0, s21
-; SI-NEXT: v_mul_f32_e64 v26, 1.0, s20
-; SI-NEXT: v_mul_f32_e64 v23, 1.0, s23
-; SI-NEXT: v_mul_f32_e64 v24, 1.0, s22
+; SI-NEXT: v_mul_f32_e64 v28, 1.0, s17
+; SI-NEXT: v_mul_f32_e64 v23, 1.0, s16
+; SI-NEXT: v_mul_f32_e64 v27, 1.0, s19
+; SI-NEXT: v_mul_f32_e64 v5, 1.0, s18
+; SI-NEXT: v_mul_f32_e64 v30, 1.0, s21
+; SI-NEXT: v_mul_f32_e64 v25, 1.0, s20
+; SI-NEXT: v_mul_f32_e64 v29, 1.0, s23
+; SI-NEXT: v_mul_f32_e64 v13, 1.0, s22
; SI-NEXT: s_cbranch_scc0 .LBB109_4
; SI-NEXT: ; %bb.1: ; %cmp.false
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v9
-; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v0
-; SI-NEXT: v_alignbit_b32 v19, v1, v16, 16
-; SI-NEXT: v_alignbit_b32 v20, v6, v8, 16
-; SI-NEXT: v_lshr_b64 v[1:2], v[19:20], 8
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v25
-; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v23
-; SI-NEXT: v_lshr_b64 v[3:4], v[19:20], 24
-; SI-NEXT: v_alignbit_b32 v21, v2, v26, 16
-; SI-NEXT: v_alignbit_b32 v22, v14, v24, 16
-; SI-NEXT: v_lshr_b64 v[4:5], v[19:20], 16
-; SI-NEXT: v_lshr_b64 v[10:11], v[21:22], 16
-; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v0
-; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v20
-; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v23
-; SI-NEXT: v_lshrrev_b32_e32 v13, 8, v22
-; SI-NEXT: v_lshr_b64 v[17:18], v[21:22], 24
-; SI-NEXT: v_lshr_b64 v[11:12], v[21:22], 8
+; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v27
+; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v28
+; SI-NEXT: v_lshr_b64 v[19:20], v[5:6], 16
+; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v29
+; SI-NEXT: v_lshr_b64 v[0:1], v[23:24], 16
+; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v30
+; SI-NEXT: v_lshr_b64 v[21:22], v[13:14], 16
+; SI-NEXT: v_mov_b32_e32 v1, v19
+; SI-NEXT: v_lshr_b64 v[8:9], v[25:26], 16
+; SI-NEXT: v_mov_b32_e32 v9, v21
+; SI-NEXT: v_lshr_b64 v[16:17], v[0:1], 16
+; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v27
+; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v29
+; SI-NEXT: v_lshrrev_b32_e32 v20, 8, v19
+; SI-NEXT: v_lshrrev_b32_e32 v22, 8, v21
+; SI-NEXT: v_lshr_b64 v[3:4], v[0:1], 24
+; SI-NEXT: v_lshr_b64 v[1:2], v[0:1], 8
+; SI-NEXT: v_lshr_b64 v[11:12], v[8:9], 24
+; SI-NEXT: v_lshr_b64 v[17:18], v[8:9], 16
+; SI-NEXT: v_lshr_b64 v[9:10], v[8:9], 8
; SI-NEXT: s_cbranch_execnz .LBB109_3
; SI-NEXT: .LBB109_2: ; %cmp.true
-; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v25
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v26
-; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; SI-NEXT: v_alignbit_b32 v21, v2, v1, 16
-; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v23
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v24
-; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v2
-; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v15
-; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v9
-; SI-NEXT: v_alignbit_b32 v22, v14, v1, 16
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v16
-; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v13
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v30
+; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v2
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v29
+; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v25
; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v2
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v5
; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v2
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v27
+; SI-NEXT: v_lshr_b64 v[8:9], v[0:1], 16
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v28
+; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v2
+; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v23
; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; SI-NEXT: v_alignbit_b32 v19, v2, v1, 16
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v8
+; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v7
+; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v10
; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
-; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v0
-; SI-NEXT: v_alignbit_b32 v20, v6, v1, 16
-; SI-NEXT: v_lshr_b64 v[3:4], v[19:20], 24
-; SI-NEXT: v_lshr_b64 v[10:11], v[21:22], 16
-; SI-NEXT: v_lshr_b64 v[4:5], v[19:20], 16
-; SI-NEXT: v_lshr_b64 v[1:2], v[19:20], 8
-; SI-NEXT: v_lshr_b64 v[17:18], v[21:22], 24
-; SI-NEXT: v_lshr_b64 v[11:12], v[21:22], 8
-; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v20
-; SI-NEXT: v_lshrrev_b32_e32 v13, 8, v22
-; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v0
-; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v15
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_lshr_b64 v[19:20], v[5:6], 16
+; SI-NEXT: v_lshr_b64 v[21:22], v[13:14], 16
+; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16
+; SI-NEXT: v_mov_b32_e32 v1, v19
+; SI-NEXT: v_mov_b32_e32 v9, v21
+; SI-NEXT: v_lshr_b64 v[16:17], v[0:1], 16
+; SI-NEXT: v_lshr_b64 v[3:4], v[0:1], 24
+; SI-NEXT: v_lshr_b64 v[1:2], v[0:1], 8
+; SI-NEXT: v_lshr_b64 v[11:12], v[8:9], 24
+; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v10
+; SI-NEXT: v_lshr_b64 v[17:18], v[8:9], 16
+; SI-NEXT: v_lshr_b64 v[9:10], v[8:9], 8
+; SI-NEXT: v_lshrrev_b32_e32 v20, 8, v19
+; SI-NEXT: v_lshrrev_b32_e32 v22, 8, v21
+; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v7
; SI-NEXT: .LBB109_3: ; %end
-; SI-NEXT: v_mov_b32_e32 v0, v19
-; SI-NEXT: v_mov_b32_e32 v2, v4
-; SI-NEXT: v_mov_b32_e32 v4, v20
-; SI-NEXT: v_mov_b32_e32 v8, v21
-; SI-NEXT: v_mov_b32_e32 v9, v11
-; SI-NEXT: v_mov_b32_e32 v11, v17
-; SI-NEXT: v_mov_b32_e32 v12, v22
+; SI-NEXT: v_mov_b32_e32 v2, v16
+; SI-NEXT: v_mov_b32_e32 v4, v19
+; SI-NEXT: v_mov_b32_e32 v5, v20
+; SI-NEXT: v_mov_b32_e32 v10, v17
+; SI-NEXT: v_mov_b32_e32 v12, v21
+; SI-NEXT: v_mov_b32_e32 v13, v22
; SI-NEXT: s_setpc_b64 s[30:31]
; SI-NEXT: .LBB109_4:
-; SI-NEXT: ; implicit-def: $vgpr19
+; SI-NEXT: ; implicit-def: $vgpr0
; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: ; implicit-def: $vgpr4
+; SI-NEXT: ; implicit-def: $vgpr16
; SI-NEXT: ; implicit-def: $vgpr3
-; SI-NEXT: ; implicit-def: $vgpr5
-; SI-NEXT: ; implicit-def: $vgpr6
+; SI-NEXT: ; implicit-def: $vgpr19
+; SI-NEXT: ; implicit-def: $vgpr20
; SI-NEXT: ; implicit-def: $vgpr7
-; SI-NEXT: ; implicit-def: $vgpr21
-; SI-NEXT: ; implicit-def: $vgpr11
-; SI-NEXT: ; implicit-def: $vgpr10
-; SI-NEXT: ; implicit-def: $vgpr13
-; SI-NEXT: ; implicit-def: $vgpr14
-; SI-NEXT: ; implicit-def: $vgpr15
+; SI-NEXT: ; implicit-def: $vgpr8
+; SI-NEXT: ; implicit-def: $vgpr9
; SI-NEXT: ; implicit-def: $vgpr17
+; SI-NEXT: ; implicit-def: $vgpr22
+; SI-NEXT: ; implicit-def: $vgpr15
+; SI-NEXT: ; implicit-def: $vgpr11
+; SI-NEXT: ; implicit-def: $vgpr21
; SI-NEXT: s_branch .LBB109_2
;
; VI-LABEL: bitcast_v8bf16_to_v16i8_scalar:
@@ -24628,142 +24653,143 @@ define inreg <16 x i8> @bitcast_v8bf16_to_v16i8_scalar(<8 x bfloat> inreg %a, i3
; VI-NEXT: s_cmp_lg_u32 s20, 0
; VI-NEXT: s_cbranch_scc0 .LBB109_3
; VI-NEXT: ; %bb.1: ; %cmp.false
-; VI-NEXT: s_lshr_b32 s10, s19, 24
-; VI-NEXT: s_lshr_b32 s11, s19, 16
-; VI-NEXT: s_lshr_b32 s13, s19, 8
-; VI-NEXT: s_lshr_b32 s12, s18, 16
-; VI-NEXT: s_lshr_b32 s14, s18, 8
-; VI-NEXT: s_lshr_b32 s15, s17, 24
-; VI-NEXT: s_lshr_b32 s20, s17, 16
-; VI-NEXT: s_lshr_b32 s22, s17, 8
-; VI-NEXT: s_lshr_b32 s21, s16, 16
-; VI-NEXT: s_lshr_b32 s23, s16, 8
+; VI-NEXT: s_lshr_b32 s21, s19, 24
+; VI-NEXT: s_lshr_b32 s20, s19, 16
+; VI-NEXT: s_lshr_b32 s15, s19, 8
+; VI-NEXT: s_lshr_b32 s23, s18, 16
+; VI-NEXT: s_lshr_b32 s22, s18, 8
+; VI-NEXT: s_lshr_b32 s12, s17, 24
+; VI-NEXT: s_lshr_b32 s11, s17, 16
+; VI-NEXT: s_lshr_b32 s10, s17, 8
+; VI-NEXT: s_lshr_b32 s14, s16, 16
+; VI-NEXT: s_lshr_b32 s13, s16, 8
; VI-NEXT: s_lshr_b64 s[6:7], s[18:19], 24
; VI-NEXT: s_lshr_b64 s[4:5], s[16:17], 24
; VI-NEXT: s_cbranch_execnz .LBB109_4
; VI-NEXT: .LBB109_2: ; %cmp.true
; VI-NEXT: s_lshl_b32 s4, s17, 16
-; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
+; VI-NEXT: v_mov_b32_e32 v6, 0x40c00000
+; VI-NEXT: v_add_f32_e32 v0, s4, v6
+; VI-NEXT: v_bfe_u32 v1, v0, 16, 1
+; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0
+; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
+; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; VI-NEXT: s_and_b32 s4, s17, 0xffff0000
+; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
+; VI-NEXT: v_add_f32_e32 v1, s4, v6
; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s17, 0xffff0000
; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; VI-NEXT: v_add_f32_e32 v2, s4, v0
-; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
-; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
-; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; VI-NEXT: s_lshl_b32 s4, s16, 16
-; VI-NEXT: v_alignbit_b32 v19, v2, v1, 16
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
+; VI-NEXT: v_lshrrev_b64 v[4:5], 16, v[0:1]
+; VI-NEXT: v_add_f32_e32 v0, s4, v6
+; VI-NEXT: v_bfe_u32 v1, v0, 16, 1
+; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0
+; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
+; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; VI-NEXT: s_and_b32 s4, s16, 0xffff0000
+; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
+; VI-NEXT: v_add_f32_e32 v1, s4, v6
; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s16, 0xffff0000
+; VI-NEXT: s_lshl_b32 s4, s19, 16
; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; VI-NEXT: v_add_f32_e32 v2, s4, v0
+; VI-NEXT: v_add_f32_e32 v2, s4, v6
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
+; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s19, 16
-; VI-NEXT: v_alignbit_b32 v18, v2, v1, 16
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
-; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
-; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; VI-NEXT: s_and_b32 s4, s19, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; VI-NEXT: v_add_f32_e32 v2, s4, v0
+; VI-NEXT: v_cndmask_b32_e32 v2, v3, v5, vcc
+; VI-NEXT: v_add_f32_e32 v3, s4, v6
+; VI-NEXT: v_bfe_u32 v5, v3, 16, 1
+; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3
+; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
+; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; VI-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; VI-NEXT: s_lshl_b32 s4, s18, 16
+; VI-NEXT: v_lshrrev_b64 v[12:13], 16, v[2:3]
+; VI-NEXT: v_add_f32_e32 v2, s4, v6
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
+; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s18, 16
-; VI-NEXT: v_alignbit_b32 v17, v2, v1, 16
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
-; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
-; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
; VI-NEXT: s_and_b32 s4, s18, 0xffff0000
-; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: v_add_f32_e32 v0, s4, v0
-; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; VI-NEXT: v_bfe_u32 v2, v0, 16, 1
-; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0
-; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; VI-NEXT: v_alignbit_b32 v16, v0, v1, 16
-; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[16:17]
-; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[18:19]
-; VI-NEXT: v_lshrrev_b32_e32 v15, 24, v17
-; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v17
-; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v17
-; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v16
-; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v16
-; VI-NEXT: v_lshrrev_b32_e32 v7, 24, v19
-; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v19
-; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v19
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v18
-; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v18
+; VI-NEXT: v_cndmask_b32_e32 v2, v3, v5, vcc
+; VI-NEXT: v_add_f32_e32 v3, s4, v6
+; VI-NEXT: v_bfe_u32 v5, v3, 16, 1
+; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3
+; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
+; VI-NEXT: v_or_b32_e32 v6, 0x400000, v3
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; VI-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; VI-NEXT: v_lshrrev_b64 v[8:9], 16, v[2:3]
+; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1]
+; VI-NEXT: v_mov_b32_e32 v9, v12
+; VI-NEXT: v_mov_b32_e32 v1, v4
+; VI-NEXT: v_lshrrev_b64 v[16:17], 24, v[8:9]
+; VI-NEXT: v_lshrrev_b64 v[17:18], 24, v[0:1]
+; VI-NEXT: v_lshrrev_b32_e32 v15, 24, v12
+; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v12
+; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v12
+; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v8
+; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v8
+; VI-NEXT: v_lshrrev_b32_e32 v7, 24, v4
+; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v4
+; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v4
+; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; VI-NEXT: v_lshrrev_b32_e32 v11, 8, v0
; VI-NEXT: s_branch .LBB109_5
; VI-NEXT: .LBB109_3:
-; VI-NEXT: ; implicit-def: $sgpr23
-; VI-NEXT: ; implicit-def: $sgpr21
-; VI-NEXT: ; implicit-def: $sgpr4
-; VI-NEXT: ; implicit-def: $sgpr22
-; VI-NEXT: ; implicit-def: $sgpr20
-; VI-NEXT: ; implicit-def: $sgpr15
+; VI-NEXT: ; implicit-def: $sgpr13
; VI-NEXT: ; implicit-def: $sgpr14
+; VI-NEXT: ; implicit-def: $sgpr4
+; VI-NEXT: ; implicit-def: $sgpr10
+; VI-NEXT: ; implicit-def: $sgpr11
; VI-NEXT: ; implicit-def: $sgpr12
+; VI-NEXT: ; implicit-def: $sgpr22
+; VI-NEXT: ; implicit-def: $sgpr23
; VI-NEXT: ; implicit-def: $sgpr6
-; VI-NEXT: ; implicit-def: $sgpr13
-; VI-NEXT: ; implicit-def: $sgpr11
-; VI-NEXT: ; implicit-def: $sgpr10
+; VI-NEXT: ; implicit-def: $sgpr15
+; VI-NEXT: ; implicit-def: $sgpr20
+; VI-NEXT: ; implicit-def: $sgpr21
; VI-NEXT: s_branch .LBB109_2
; VI-NEXT: .LBB109_4:
-; VI-NEXT: v_mov_b32_e32 v18, s16
-; VI-NEXT: v_mov_b32_e32 v19, s17
-; VI-NEXT: v_mov_b32_e32 v16, s18
-; VI-NEXT: v_mov_b32_e32 v17, s19
-; VI-NEXT: v_mov_b32_e32 v1, s23
-; VI-NEXT: v_mov_b32_e32 v2, s21
-; VI-NEXT: v_mov_b32_e32 v5, s22
-; VI-NEXT: v_mov_b32_e32 v6, s20
-; VI-NEXT: v_mov_b32_e32 v7, s15
-; VI-NEXT: v_mov_b32_e32 v9, s14
-; VI-NEXT: v_mov_b32_e32 v10, s12
-; VI-NEXT: v_mov_b32_e32 v13, s13
-; VI-NEXT: v_mov_b32_e32 v14, s11
-; VI-NEXT: v_mov_b32_e32 v15, s10
-; VI-NEXT: v_mov_b32_e32 v11, s6
-; VI-NEXT: v_mov_b32_e32 v3, s4
+; VI-NEXT: v_mov_b32_e32 v8, s18
+; VI-NEXT: v_mov_b32_e32 v12, s19
+; VI-NEXT: v_mov_b32_e32 v0, s16
+; VI-NEXT: v_mov_b32_e32 v4, s17
+; VI-NEXT: v_mov_b32_e32 v10, s23
+; VI-NEXT: v_mov_b32_e32 v9, s22
+; VI-NEXT: v_mov_b32_e32 v15, s21
+; VI-NEXT: v_mov_b32_e32 v14, s20
+; VI-NEXT: v_mov_b32_e32 v13, s15
+; VI-NEXT: v_mov_b32_e32 v2, s14
+; VI-NEXT: v_mov_b32_e32 v11, s13
+; VI-NEXT: v_mov_b32_e32 v7, s12
+; VI-NEXT: v_mov_b32_e32 v6, s11
+; VI-NEXT: v_mov_b32_e32 v5, s10
+; VI-NEXT: v_mov_b32_e32 v16, s6
+; VI-NEXT: v_mov_b32_e32 v17, s4
; VI-NEXT: .LBB109_5: ; %end
-; VI-NEXT: v_mov_b32_e32 v0, v18
-; VI-NEXT: v_mov_b32_e32 v4, v19
-; VI-NEXT: v_mov_b32_e32 v8, v16
-; VI-NEXT: v_mov_b32_e32 v12, v17
+; VI-NEXT: v_mov_b32_e32 v3, v17
+; VI-NEXT: v_mov_b32_e32 v1, v11
+; VI-NEXT: v_mov_b32_e32 v11, v16
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: bitcast_v8bf16_to_v16i8_scalar:
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll
index 01e397d629ea9..ccc6e9c7e9c16 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll
@@ -4052,90 +4052,92 @@ define inreg <8 x i32> @bitcast_v16bf16_to_v8i32_scalar(<16 x bfloat> inreg %a,
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
; SI-NEXT: s_and_b64 s[4:5], vcc, exec
-; SI-NEXT: v_mul_f32_e64 v22, 1.0, s17
-; SI-NEXT: v_mul_f32_e64 v23, 1.0, s16
-; SI-NEXT: v_mul_f32_e64 v20, 1.0, s19
-; SI-NEXT: v_mul_f32_e64 v21, 1.0, s18
-; SI-NEXT: v_mul_f32_e64 v18, 1.0, s21
-; SI-NEXT: v_mul_f32_e64 v19, 1.0, s20
-; SI-NEXT: v_mul_f32_e64 v16, 1.0, s23
-; SI-NEXT: v_mul_f32_e64 v17, 1.0, s22
-; SI-NEXT: v_mul_f32_e64 v14, 1.0, s25
-; SI-NEXT: v_mul_f32_e64 v15, 1.0, s24
-; SI-NEXT: v_mul_f32_e64 v12, 1.0, s27
-; SI-NEXT: v_mul_f32_e64 v13, 1.0, s26
-; SI-NEXT: v_mul_f32_e64 v10, 1.0, s29
-; SI-NEXT: v_mul_f32_e64 v11, 1.0, s28
-; SI-NEXT: v_mul_f32_e32 v8, 1.0, v1
-; SI-NEXT: v_mul_f32_e32 v9, 1.0, v0
+; SI-NEXT: v_mul_f32_e64 v31, 1.0, s17
+; SI-NEXT: v_mul_f32_e64 v22, 1.0, s16
+; SI-NEXT: v_mul_f32_e64 v30, 1.0, s19
+; SI-NEXT: v_mul_f32_e64 v20, 1.0, s18
+; SI-NEXT: v_mul_f32_e64 v29, 1.0, s21
+; SI-NEXT: v_mul_f32_e64 v18, 1.0, s20
+; SI-NEXT: v_mul_f32_e64 v28, 1.0, s23
+; SI-NEXT: v_mul_f32_e64 v16, 1.0, s22
+; SI-NEXT: v_mul_f32_e64 v27, 1.0, s25
+; SI-NEXT: v_mul_f32_e64 v14, 1.0, s24
+; SI-NEXT: v_mul_f32_e64 v26, 1.0, s27
+; SI-NEXT: v_mul_f32_e64 v25, 1.0, s29
+; SI-NEXT: v_mul_f32_e32 v24, 1.0, v1
+; SI-NEXT: v_mul_f32_e32 v8, 1.0, v0
+; SI-NEXT: v_mul_f32_e64 v12, 1.0, s26
+; SI-NEXT: v_mul_f32_e64 v10, 1.0, s28
; SI-NEXT: s_cbranch_scc0 .LBB23_4
; SI-NEXT: ; %bb.1: ; %cmp.false
-; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v22
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v20
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v18
-; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v16
-; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v14
-; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v12
-; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v10
-; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v8
-; SI-NEXT: v_alignbit_b32 v0, v0, v23, 16
-; SI-NEXT: v_alignbit_b32 v1, v1, v21, 16
-; SI-NEXT: v_alignbit_b32 v2, v2, v19, 16
-; SI-NEXT: v_alignbit_b32 v3, v3, v17, 16
-; SI-NEXT: v_alignbit_b32 v4, v4, v15, 16
-; SI-NEXT: v_alignbit_b32 v5, v5, v13, 16
-; SI-NEXT: v_alignbit_b32 v6, v6, v11, 16
-; SI-NEXT: v_alignbit_b32 v7, v7, v9, 16
+; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v31
+; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v30
+; SI-NEXT: v_lshr_b64 v[0:1], v[22:23], 16
+; SI-NEXT: v_lshr_b64 v[1:2], v[20:21], 16
+; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v29
+; SI-NEXT: v_lshr_b64 v[2:3], v[18:19], 16
+; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v28
+; SI-NEXT: v_lshr_b64 v[3:4], v[16:17], 16
+; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v27
+; SI-NEXT: v_lshr_b64 v[4:5], v[14:15], 16
+; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v26
+; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v24
+; SI-NEXT: v_lshr_b64 v[5:6], v[12:13], 16
+; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v25
+; SI-NEXT: v_lshr_b64 v[32:33], v[8:9], 16
+; SI-NEXT: v_lshr_b64 v[6:7], v[10:11], 16
+; SI-NEXT: v_mov_b32_e32 v7, v32
; SI-NEXT: s_cbranch_execnz .LBB23_3
; SI-NEXT: .LBB23_2: ; %cmp.true
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v22
-; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v23
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v31
+; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v22
; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v30
; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v20
-; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v21
+; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v18
-; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16
-; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v19
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16
+; SI-NEXT: v_lshr_b64 v[1:2], v[2:3], 16
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v29
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v18
; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v16
-; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16
-; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v17
+; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 16
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v28
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v16
; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v14
-; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16
-; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v15
+; SI-NEXT: v_lshr_b64 v[3:4], v[3:4], 16
+; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v27
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v14
; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v12
-; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16
-; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v13
+; SI-NEXT: v_lshr_b64 v[4:5], v[4:5], 16
+; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v26
+; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v12
; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v10
-; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16
-; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v11
+; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v24
+; SI-NEXT: v_lshr_b64 v[5:6], v[5:6], 16
+; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v25
+; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9
+; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v10
; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
+; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
+; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9
; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7
-; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
-; SI-NEXT: v_alignbit_b32 v6, v7, v6, 16
-; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v9
-; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
-; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
-; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8
-; SI-NEXT: v_alignbit_b32 v7, v8, v7, 16
+; SI-NEXT: v_lshr_b64 v[8:9], v[8:9], 16
+; SI-NEXT: v_lshr_b64 v[6:7], v[6:7], 16
+; SI-NEXT: v_mov_b32_e32 v7, v8
; SI-NEXT: .LBB23_3: ; %end
; SI-NEXT: s_setpc_b64 s[30:31]
; SI-NEXT: .LBB23_4:
@@ -4151,150 +4153,153 @@ define inreg <8 x i32> @bitcast_v16bf16_to_v8i32_scalar(<16 x bfloat> inreg %a,
; VI-NEXT: s_cbranch_execnz .LBB23_4
; VI-NEXT: .LBB23_2: ; %cmp.true
; VI-NEXT: s_lshl_b32 s4, s23, 16
-; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
+; VI-NEXT: v_mov_b32_e32 v8, 0x40c00000
+; VI-NEXT: v_add_f32_e32 v0, s4, v8
+; VI-NEXT: v_bfe_u32 v1, v0, 16, 1
+; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0
+; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
+; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; VI-NEXT: s_and_b32 s4, s23, 0xffff0000
+; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
+; VI-NEXT: v_add_f32_e32 v1, s4, v8
; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s23, 0xffff0000
+; VI-NEXT: s_lshl_b32 s4, s22, 16
; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; VI-NEXT: v_add_f32_e32 v2, s4, v0
+; VI-NEXT: v_add_f32_e32 v2, s4, v8
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s22, 16
-; VI-NEXT: v_alignbit_b32 v7, v2, v1, 16
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
-; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
-; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; VI-NEXT: s_and_b32 s4, s22, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; VI-NEXT: v_add_f32_e32 v2, s4, v0
+; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
+; VI-NEXT: v_add_f32_e32 v3, s4, v8
+; VI-NEXT: v_bfe_u32 v4, v3, 16, 1
+; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3
+; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
+; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; VI-NEXT: s_lshl_b32 s4, s21, 16
+; VI-NEXT: v_lshrrev_b64 v[6:7], 16, v[2:3]
+; VI-NEXT: v_add_f32_e32 v2, s4, v8
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
+; VI-NEXT: s_and_b32 s4, s21, 0xffff0000
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s21, 16
-; VI-NEXT: v_alignbit_b32 v6, v2, v1, 16
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
-; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
-; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s21, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; VI-NEXT: v_add_f32_e32 v2, s4, v0
+; VI-NEXT: v_add_f32_e32 v2, s4, v8
+; VI-NEXT: v_cndmask_b32_e32 v9, v3, v4, vcc
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; VI-NEXT: s_lshl_b32 s4, s20, 16
-; VI-NEXT: v_alignbit_b32 v5, v2, v1, 16
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
-; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
-; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s20, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; VI-NEXT: v_add_f32_e32 v2, s4, v0
+; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v2
+; VI-NEXT: v_add_f32_e32 v2, s4, v8
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: s_and_b32 s4, s20, 0xffff0000
; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; VI-NEXT: v_add_f32_e32 v3, s4, v8
+; VI-NEXT: v_bfe_u32 v4, v3, 16, 1
+; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3
+; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
+; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; VI-NEXT: s_lshl_b32 s4, s19, 16
-; VI-NEXT: v_alignbit_b32 v4, v2, v1, 16
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
-; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
-; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; VI-NEXT: v_lshrrev_b64 v[4:5], 16, v[2:3]
+; VI-NEXT: v_add_f32_e32 v2, s4, v8
+; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
+; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
+; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
; VI-NEXT: s_and_b32 s4, s19, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; VI-NEXT: v_add_f32_e32 v2, s4, v0
+; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: v_add_f32_e32 v2, s4, v8
+; VI-NEXT: v_cndmask_b32_e32 v11, v3, v5, vcc
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v8, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; VI-NEXT: v_cndmask_b32_e32 v2, v3, v5, vcc
; VI-NEXT: s_lshl_b32 s4, s18, 16
-; VI-NEXT: v_alignbit_b32 v3, v2, v1, 16
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
-; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
-; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; VI-NEXT: v_or_b32_e32 v8, 0x400000, v1
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s18, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v1, v2, v8, vcc
-; VI-NEXT: v_add_f32_e32 v2, s4, v0
-; VI-NEXT: v_bfe_u32 v8, v2, 16, 1
-; VI-NEXT: v_add_u32_e32 v8, vcc, v8, v2
-; VI-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
-; VI-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v2
+; VI-NEXT: v_add_f32_e32 v2, s4, v8
+; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
+; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
+; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
+; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: v_cndmask_b32_e32 v2, v8, v9, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; VI-NEXT: s_and_b32 s4, s18, 0xffff0000
+; VI-NEXT: v_cndmask_b32_e32 v2, v3, v5, vcc
+; VI-NEXT: v_add_f32_e32 v3, s4, v8
+; VI-NEXT: v_bfe_u32 v5, v3, 16, 1
+; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3
+; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
+; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; VI-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; VI-NEXT: v_lshrrev_b64 v[2:3], 16, v[2:3]
; VI-NEXT: s_lshl_b32 s4, s17, 16
-; VI-NEXT: v_alignbit_b32 v2, v2, v1, 16
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
-; VI-NEXT: v_bfe_u32 v8, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v8, vcc, v8, v1
-; VI-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
-; VI-NEXT: v_or_b32_e32 v9, 0x400000, v1
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; VI-NEXT: v_add_f32_e32 v3, s4, v8
+; VI-NEXT: v_bfe_u32 v5, v3, 16, 1
+; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3
+; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
; VI-NEXT: s_and_b32 s4, s17, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc
-; VI-NEXT: v_add_f32_e32 v8, s4, v0
-; VI-NEXT: v_bfe_u32 v9, v8, 16, 1
-; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8
-; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
-; VI-NEXT: v_or_b32_e32 v10, 0x400000, v8
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
-; VI-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; VI-NEXT: v_add_f32_e32 v3, s4, v8
+; VI-NEXT: v_cndmask_b32_e32 v13, v5, v7, vcc
+; VI-NEXT: v_bfe_u32 v5, v3, 16, 1
+; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3
+; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
+; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; VI-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc
; VI-NEXT: s_lshl_b32 s4, s16, 16
-; VI-NEXT: v_alignbit_b32 v1, v8, v1, 16
-; VI-NEXT: v_add_f32_e32 v8, s4, v0
-; VI-NEXT: v_bfe_u32 v9, v8, 16, 1
-; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8
-; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
-; VI-NEXT: s_and_b32 s4, s16, 0xffff0000
-; VI-NEXT: v_or_b32_e32 v10, 0x400000, v8
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
-; VI-NEXT: v_add_f32_e32 v0, s4, v0
-; VI-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc
-; VI-NEXT: v_bfe_u32 v9, v0, 16, 1
-; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v0
-; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
-; VI-NEXT: v_or_b32_e32 v10, 0x400000, v0
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; VI-NEXT: v_cndmask_b32_e32 v0, v9, v10, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; VI-NEXT: v_alignbit_b32 v0, v0, v8, 16
+; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v3
+; VI-NEXT: v_add_f32_e32 v3, s4, v8
+; VI-NEXT: v_bfe_u32 v5, v3, 16, 1
+; VI-NEXT: s_and_b32 s6, s16, 0xffff0000
+; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3
+; VI-NEXT: v_or_b32_e32 v15, 0x400000, v3
+; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
+; VI-NEXT: v_add_f32_e32 v3, s6, v8
+; VI-NEXT: v_bfe_u32 v7, v3, 16, 1
+; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v3
+; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
+; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v7
+; VI-NEXT: v_lshrrev_b64 v[7:8], 16, v[0:1]
+; VI-NEXT: v_or_b32_e32 v17, 0x400000, v3
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; VI-NEXT: v_lshrrev_b64 v[8:9], 16, v[9:10]
+; VI-NEXT: v_lshrrev_b64 v[9:10], 16, v[11:12]
+; VI-NEXT: v_cndmask_b32_e32 v1, v16, v17, vcc
+; VI-NEXT: v_lshrrev_b64 v[10:11], 16, v[13:14]
+; VI-NEXT: v_cndmask_b32_e64 v0, v5, v15, s[4:5]
+; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1]
+; VI-NEXT: v_mov_b32_e32 v1, v10
+; VI-NEXT: v_mov_b32_e32 v3, v9
+; VI-NEXT: v_mov_b32_e32 v5, v8
; VI-NEXT: s_setpc_b64 s[30:31]
; VI-NEXT: .LBB23_3:
; VI-NEXT: s_branch .LBB23_2
@@ -11204,90 +11209,92 @@ define inreg <8 x float> @bitcast_v16bf16_to_v8f32_scalar(<16 x bfloat> inreg %a
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
; SI-NEXT: s_and_b64 s[4:5], vcc, exec
-; SI-NEXT: v_mul_f32_e64 v22, 1.0, s17
-; SI-NEXT: v_mul_f32_e64 v23, 1.0, s16
-; SI-NEXT: v_mul_f32_e64 v20, 1.0, s19
-; SI-NEXT: v_mul_f32_e64 v21, 1.0, s18
-; SI-NEXT: v_mul_f32_e64 v18, 1.0, s21
-; SI-NEXT: v_mul_f32_e64 v19, 1.0, s20
-; SI-NEXT: v_mul_f32_e64 v16, 1.0, s23
-; SI-NEXT: v_mul_f32_e64 v17, 1.0, s22
-; SI-NEXT: v_mul_f32_e64 v14, 1.0, s25
-; SI-NEXT: v_mul_f32_e64 v15, 1.0, s24
-; SI-NEXT: v_mul_f32_e64 v12, 1.0, s27
-; SI-NEXT: v_mul_f32_e64 v13, 1.0, s26
-; SI-NEXT: v_mul_f32_e64 v10, 1.0, s29
-; SI-NEXT: v_mul_f32_e64 v11, 1.0, s28
-; SI-NEXT: v_mul_f32_e32 v8, 1.0, v1
-; SI-NEXT: v_mul_f32_e32 v9, 1.0, v0
+; SI-NEXT: v_mul_f32_e64 v31, 1.0, s17
+; SI-NEXT: v_mul_f32_e64 v22, 1.0, s16
+; SI-NEXT: v_mul_f32_e64 v30, 1.0, s19
+; SI-NEXT: v_mul_f32_e64 v20, 1.0, s18
+; SI-NEXT: v_mul_f32_e64 v29, 1.0, s21
+; SI-NEXT: v_mul_f32_e64 v18, 1.0, s20
+; SI-NEXT: v_mul_f32_e64 v28, 1.0, s23
+; SI-NEXT: v_mul_f32_e64 v16, 1.0, s22
+; SI-NEXT: v_mul_f32_e64 v27, 1.0, s25
+; SI-NEXT: v_mul_f32_e64 v14, 1.0, s24
+; SI-NEXT: v_mul_f32_e64 v26, 1.0, s27
+; SI-NEXT: v_mul_f32_e64 v25, 1.0, s29
+; SI-NEXT: v_mul_f32_e32 v24, 1.0, v1
+; SI-NEXT: v_mul_f32_e32 v8, 1.0, v0
+; SI-NEXT: v_mul_f32_e64 v12, 1.0, s26
+; SI-NEXT: v_mul_f32_e64 v10, 1.0, s28
; SI-NEXT: s_cbranch_scc0 .LBB47_4
; SI-NEXT: ; %bb.1: ; %cmp.false
-; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v22
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v20
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v18
-; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v16
-; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v14
-; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v12
-; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v10
-; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v8
-; SI-NEXT: v_alignbit_b32 v0, v0, v23, 16
-; SI-NEXT: v_alignbit_b32 v1, v1, v21, 16
-; SI-NEXT: v_alignbit_b32 v2, v2, v19, 16
-; SI-NEXT: v_alignbit_b32 v3, v3, v17, 16
-; SI-NEXT: v_alignbit_b32 v4, v4, v15, 16
-; SI-NEXT: v_alignbit_b32 v5, v5, v13, 16
-; SI-NEXT: v_alignbit_b32 v6, v6, v11, 16
-; SI-NEXT: v_alignbit_b32 v7, v7, v9, 16
+; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v31
+; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v30
+; SI-NEXT: v_lshr_b64 v[0:1], v[22:23], 16
+; SI-NEXT: v_lshr_b64 v[1:2], v[20:21], 16
+; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v29
+; SI-NEXT: v_lshr_b64 v[2:3], v[18:19], 16
+; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v28
+; SI-NEXT: v_lshr_b64 v[3:4], v[16:17], 16
+; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v27
+; SI-NEXT: v_lshr_b64 v[4:5], v[14:15], 16
+; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v26
+; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v24
+; SI-NEXT: v_lshr_b64 v[5:6], v[12:13], 16
+; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v25
+; SI-NEXT: v_lshr_b64 v[32:33], v[8:9], 16
+; SI-NEXT: v_lshr_b64 v[6:7], v[10:11], 16
+; SI-NEXT: v_mov_b32_e32 v7, v32
; SI-NEXT: s_cbranch_execnz .LBB47_3
; SI-NEXT: .LBB47_2: ; %cmp.true
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v22
-; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v23
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v31
+; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v22
; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v30
; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v20
-; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v21
+; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v18
-; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16
-; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v19
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16
+; SI-NEXT: v_lshr_b64 v[1:2], v[2:3], 16
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v29
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v18
; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v16
-; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16
-; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v17
+; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 16
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v28
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v16
; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v14
-; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16
-; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v15
+; SI-NEXT: v_lshr_b64 v[3:4], v[3:4], 16
+; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v27
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v14
; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v12
-; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16
-; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v13
+; SI-NEXT: v_lshr_b64 v[4:5], v[4:5], 16
+; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v26
+; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v12
; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v10
-; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16
-; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v11
+; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v24
+; SI-NEXT: v_lshr_b64 v[5:6], v[5:6], 16
+; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v25
+; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9
+; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v10
; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
+; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
+; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9
; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7
-; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
-; SI-NEXT: v_alignbit_b32 v6, v7, v6, 16
-; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v9
-; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
-; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
-; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8
-; SI-NEXT: v_alignbit_b32 v7, v8, v7, 16
+; SI-NEXT: v_lshr_b64 v[8:9], v[8:9], 16
+; SI-NEXT: v_lshr_b64 v[6:7], v[6:7], 16
+; SI-NEXT: v_mov_b32_e32 v7, v8
; SI-NEXT: .LBB47_3: ; %end
; SI-NEXT: s_setpc_b64 s[30:31]
; SI-NEXT: .LBB47_4:
@@ -11303,150 +11310,153 @@ define inreg <8 x float> @bitcast_v16bf16_to_v8f32_scalar(<16 x bfloat> inreg %a
; VI-NEXT: s_cbranch_execnz .LBB47_4
; VI-NEXT: .LBB47_2: ; %cmp.true
; VI-NEXT: s_lshl_b32 s4, s23, 16
-; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
+; VI-NEXT: v_mov_b32_e32 v8, 0x40c00000
+; VI-NEXT: v_add_f32_e32 v0, s4, v8
+; VI-NEXT: v_bfe_u32 v1, v0, 16, 1
+; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0
+; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
+; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; VI-NEXT: s_and_b32 s4, s23, 0xffff0000
+; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
+; VI-NEXT: v_add_f32_e32 v1, s4, v8
; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s23, 0xffff0000
+; VI-NEXT: s_lshl_b32 s4, s22, 16
; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; VI-NEXT: v_add_f32_e32 v2, s4, v0
+; VI-NEXT: v_add_f32_e32 v2, s4, v8
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s22, 16
-; VI-NEXT: v_alignbit_b32 v7, v2, v1, 16
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
-; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
-; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; VI-NEXT: s_and_b32 s4, s22, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
+; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
+; VI-NEXT: v_add_f32_e32 v3, s4, v8
+; VI-NEXT: v_bfe_u32 v4, v3, 16, 1
+; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3
+; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
+; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; VI-NEXT: s_lshl_b32 s4, s21, 16
+; VI-NEXT: v_lshrrev_b64 v[6:7], 16, v[2:3]
+; VI-NEXT: v_add_f32_e32 v2, s4, v8
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
+; VI-NEXT: s_and_b32 s4, s21, 0xffff0000
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s21, 16
-; VI-NEXT: v_alignbit_b32 v6, v2, v1, 16
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
-; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
-; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s21, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_add_f32_e32 v2, s4, v8 +; VI-NEXT: v_cndmask_b32_e32 v9, v3, v4, vcc ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; VI-NEXT: s_lshl_b32 s4, s20, 16 -; VI-NEXT: v_alignbit_b32 v5, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v2 +; VI-NEXT: v_add_f32_e32 v2, s4, v8 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 ; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_add_f32_e32 v3, s4, v8 +; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; VI-NEXT: s_lshl_b32 s4, s19, 16 -; VI-NEXT: v_alignbit_b32 v4, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_lshrrev_b64 v[4:5], 16, v[2:3] +; VI-NEXT: v_add_f32_e32 v2, s4, v8 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_add_f32_e32 v2, s4, v8 +; VI-NEXT: v_cndmask_b32_e32 v11, v3, v5, vcc ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v8, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v5, vcc ; VI-NEXT: s_lshl_b32 s4, s18, 16 -; VI-NEXT: v_alignbit_b32 v3, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v8, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v8, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v8, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v8, vcc, v8, v2 -; VI-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; VI-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v2 +; VI-NEXT: v_add_f32_e32 v2, s4, v8 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; 
VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v8, v9, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v5, vcc +; VI-NEXT: v_add_f32_e32 v3, s4, v8 +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_lshrrev_b64 v[2:3], 16, v[2:3] ; VI-NEXT: s_lshl_b32 s4, s17, 16 -; VI-NEXT: v_alignbit_b32 v2, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v8, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v8, vcc, v8, v1 -; VI-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; VI-NEXT: v_or_b32_e32 v9, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_add_f32_e32 v3, s4, v8 +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 ; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc -; VI-NEXT: v_add_f32_e32 v8, s4, v0 -; VI-NEXT: v_bfe_u32 v9, v8, 16, 1 -; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8 -; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; VI-NEXT: v_or_b32_e32 v10, 0x400000, v8 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 -; VI-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc -; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_add_f32_e32 v3, s4, v8 +; VI-NEXT: v_cndmask_b32_e32 v13, v5, v7, vcc +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc ; VI-NEXT: s_lshl_b32 s4, s16, 16 -; VI-NEXT: v_alignbit_b32 v1, v8, v1, 16 -; VI-NEXT: v_add_f32_e32 v8, s4, v0 -; VI-NEXT: v_bfe_u32 v9, v8, 16, 1 -; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8 -; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v10, 0x400000, v8 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 -; VI-NEXT: v_add_f32_e32 v0, s4, v0 -; VI-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc -; VI-NEXT: v_bfe_u32 v9, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v0 -; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; VI-NEXT: v_or_b32_e32 v10, 0x400000, v0 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v9, v10, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v0, v8, 16 +; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v3 +; VI-NEXT: v_add_f32_e32 v3, s4, v8 +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: s_and_b32 s6, s16, 0xffff0000 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; VI-NEXT: v_add_f32_e32 v3, s6, v8 +; VI-NEXT: v_bfe_u32 v7, v3, 16, 1 +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v3 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v7 +; VI-NEXT: v_lshrrev_b64 v[7:8], 16, v[0:1] +; VI-NEXT: v_or_b32_e32 v17, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_lshrrev_b64 v[8:9], 16, v[9:10] +; VI-NEXT: v_lshrrev_b64 v[9:10], 16, v[11:12] +; VI-NEXT: 
v_cndmask_b32_e32 v1, v16, v17, vcc +; VI-NEXT: v_lshrrev_b64 v[10:11], 16, v[13:14] +; VI-NEXT: v_cndmask_b32_e64 v0, v5, v15, s[4:5] +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, v10 +; VI-NEXT: v_mov_b32_e32 v3, v9 +; VI-NEXT: v_mov_b32_e32 v5, v8 ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB47_3: ; VI-NEXT: s_branch .LBB47_2 @@ -17924,90 +17934,92 @@ define inreg <4 x i64> @bitcast_v16bf16_to_v4i64_scalar(<16 x bfloat> inreg %a, ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mul_f32_e64 v22, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v23, 1.0, s16 -; SI-NEXT: v_mul_f32_e64 v20, 1.0, s19 -; SI-NEXT: v_mul_f32_e64 v21, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v18, 1.0, s21 -; SI-NEXT: v_mul_f32_e64 v19, 1.0, s20 -; SI-NEXT: v_mul_f32_e64 v16, 1.0, s23 -; SI-NEXT: v_mul_f32_e64 v17, 1.0, s22 -; SI-NEXT: v_mul_f32_e64 v14, 1.0, s25 -; SI-NEXT: v_mul_f32_e64 v15, 1.0, s24 -; SI-NEXT: v_mul_f32_e64 v12, 1.0, s27 -; SI-NEXT: v_mul_f32_e64 v13, 1.0, s26 -; SI-NEXT: v_mul_f32_e64 v10, 1.0, s29 -; SI-NEXT: v_mul_f32_e64 v11, 1.0, s28 -; SI-NEXT: v_mul_f32_e32 v8, 1.0, v1 -; SI-NEXT: v_mul_f32_e32 v9, 1.0, v0 +; SI-NEXT: v_mul_f32_e64 v31, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v22, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v30, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v20, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v29, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v18, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v28, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v16, 1.0, s22 +; SI-NEXT: v_mul_f32_e64 v27, 1.0, s25 +; SI-NEXT: v_mul_f32_e64 v14, 1.0, s24 +; SI-NEXT: v_mul_f32_e64 v26, 1.0, s27 +; SI-NEXT: v_mul_f32_e64 v25, 1.0, s29 +; SI-NEXT: v_mul_f32_e32 v24, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v8, 1.0, v0 +; SI-NEXT: v_mul_f32_e64 v12, 1.0, s26 +; SI-NEXT: v_mul_f32_e64 v10, 1.0, s28 ; SI-NEXT: s_cbranch_scc0 .LBB67_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_alignbit_b32 v0, v0, v23, 16 -; SI-NEXT: v_alignbit_b32 v1, v1, v21, 16 -; SI-NEXT: v_alignbit_b32 v2, v2, v19, 16 -; SI-NEXT: v_alignbit_b32 v3, v3, v17, 16 -; SI-NEXT: v_alignbit_b32 v4, v4, v15, 16 -; SI-NEXT: v_alignbit_b32 v5, v5, v13, 16 -; SI-NEXT: v_alignbit_b32 v6, v6, v11, 16 -; SI-NEXT: v_alignbit_b32 v7, v7, v9, 16 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v31 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v30 +; SI-NEXT: v_lshr_b64 v[0:1], v[22:23], 16 +; SI-NEXT: v_lshr_b64 v[1:2], v[20:21], 16 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v29 +; SI-NEXT: v_lshr_b64 v[2:3], v[18:19], 16 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v28 +; SI-NEXT: v_lshr_b64 v[3:4], v[16:17], 16 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v27 +; SI-NEXT: v_lshr_b64 v[4:5], v[14:15], 16 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v24 +; SI-NEXT: v_lshr_b64 v[5:6], v[12:13], 16 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v25 +; SI-NEXT: v_lshr_b64 v[32:33], v[8:9], 16 +; SI-NEXT: v_lshr_b64 v[6:7], v[10:11], 16 +; SI-NEXT: v_mov_b32_e32 v7, v32 ; SI-NEXT: s_cbranch_execnz .LBB67_3 ; SI-NEXT: .LBB67_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v22 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v23 +; 
SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v31 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v22 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v30 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v20 -; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v21 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v18 -; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v19 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 +; SI-NEXT: v_lshr_b64 v[1:2], v[2:3], 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v29 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v18 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v16 -; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v17 +; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 16 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v28 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v16 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v14 -; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v15 +; SI-NEXT: v_lshr_b64 v[3:4], v[3:4], 16 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v27 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v14 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v12 -; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v13 +; SI-NEXT: v_lshr_b64 v[4:5], v[4:5], 16 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v26 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v12 ; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 ; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v10 -; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v11 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v24 +; SI-NEXT: v_lshr_b64 v[5:6], v[5:6], 16 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v25 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v10 ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 ; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; SI-NEXT: v_alignbit_b32 v6, v7, v6, 16 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v9 -; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_alignbit_b32 v7, v8, v7, 16 +; SI-NEXT: v_lshr_b64 v[8:9], v[8:9], 16 +; SI-NEXT: v_lshr_b64 v[6:7], v[6:7], 16 +; SI-NEXT: v_mov_b32_e32 v7, v8 ; SI-NEXT: .LBB67_3: ; %end ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB67_4: @@ -18023,150 +18035,153 @@ define inreg <4 x i64> @bitcast_v16bf16_to_v4i64_scalar(<16 x bfloat> inreg %a, ; VI-NEXT: s_cbranch_execnz .LBB67_4 ; VI-NEXT: .LBB67_2: ; %cmp.true ; VI-NEXT: 
s_lshl_b32 s4, s23, 16 -; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_mov_b32_e32 v8, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v0, s4, v8 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; VI-NEXT: v_add_f32_e32 v1, s4, v8 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 +; VI-NEXT: s_lshl_b32 s4, s22, 16 ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_add_f32_e32 v2, s4, v8 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s22, 16 -; VI-NEXT: v_alignbit_b32 v7, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; VI-NEXT: s_and_b32 s4, s22, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_add_f32_e32 v3, s4, v8 +; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: s_lshl_b32 s4, s21, 16 +; VI-NEXT: v_lshrrev_b64 v[6:7], 16, v[2:3] +; VI-NEXT: v_add_f32_e32 v2, s4, v8 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s21, 16 -; VI-NEXT: v_alignbit_b32 v6, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_add_f32_e32 v2, s4, v8 +; VI-NEXT: v_cndmask_b32_e32 v9, v3, v4, vcc ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; VI-NEXT: s_lshl_b32 s4, s20, 16 -; VI-NEXT: v_alignbit_b32 v5, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, 
v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v2 +; VI-NEXT: v_add_f32_e32 v2, s4, v8 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 ; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_add_f32_e32 v3, s4, v8 +; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; VI-NEXT: s_lshl_b32 s4, s19, 16 -; VI-NEXT: v_alignbit_b32 v4, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_lshrrev_b64 v[4:5], 16, v[2:3] +; VI-NEXT: v_add_f32_e32 v2, s4, v8 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_add_f32_e32 v2, s4, v8 +; VI-NEXT: v_cndmask_b32_e32 v11, v3, v5, vcc ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v8, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v5, vcc ; VI-NEXT: s_lshl_b32 s4, s18, 16 -; VI-NEXT: v_alignbit_b32 v3, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v8, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v8, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v8, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v8, vcc, v8, v2 -; VI-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; VI-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v2 +; VI-NEXT: v_add_f32_e32 v2, s4, v8 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v8, v9, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v5, vcc +; VI-NEXT: v_add_f32_e32 v3, s4, v8 +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_lshrrev_b64 v[2:3], 16, v[2:3] ; VI-NEXT: 
s_lshl_b32 s4, s17, 16 -; VI-NEXT: v_alignbit_b32 v2, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v8, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v8, vcc, v8, v1 -; VI-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; VI-NEXT: v_or_b32_e32 v9, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_add_f32_e32 v3, s4, v8 +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 ; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc -; VI-NEXT: v_add_f32_e32 v8, s4, v0 -; VI-NEXT: v_bfe_u32 v9, v8, 16, 1 -; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8 -; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; VI-NEXT: v_or_b32_e32 v10, 0x400000, v8 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 -; VI-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc -; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_add_f32_e32 v3, s4, v8 +; VI-NEXT: v_cndmask_b32_e32 v13, v5, v7, vcc +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc ; VI-NEXT: s_lshl_b32 s4, s16, 16 -; VI-NEXT: v_alignbit_b32 v1, v8, v1, 16 -; VI-NEXT: v_add_f32_e32 v8, s4, v0 -; VI-NEXT: v_bfe_u32 v9, v8, 16, 1 -; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8 -; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v10, 0x400000, v8 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 -; VI-NEXT: v_add_f32_e32 v0, s4, v0 -; VI-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc -; VI-NEXT: v_bfe_u32 v9, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v0 -; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; VI-NEXT: v_or_b32_e32 v10, 0x400000, v0 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v9, v10, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v0, v8, 16 +; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v3 +; VI-NEXT: v_add_f32_e32 v3, s4, v8 +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: s_and_b32 s6, s16, 0xffff0000 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; VI-NEXT: v_add_f32_e32 v3, s6, v8 +; VI-NEXT: v_bfe_u32 v7, v3, 16, 1 +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v3 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v7 +; VI-NEXT: v_lshrrev_b64 v[7:8], 16, v[0:1] +; VI-NEXT: v_or_b32_e32 v17, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_lshrrev_b64 v[8:9], 16, v[9:10] +; VI-NEXT: v_lshrrev_b64 v[9:10], 16, v[11:12] +; VI-NEXT: v_cndmask_b32_e32 v1, v16, v17, vcc +; VI-NEXT: v_lshrrev_b64 v[10:11], 16, v[13:14] +; VI-NEXT: v_cndmask_b32_e64 v0, v5, v15, s[4:5] +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, v10 +; VI-NEXT: v_mov_b32_e32 v3, v9 +; VI-NEXT: v_mov_b32_e32 v5, v8 ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB67_3: ; VI-NEXT: s_branch .LBB67_2 @@ -24092,90 +24107,92 @@ define inreg <4 x double> @bitcast_v16bf16_to_v4f64_scalar(<16 x bfloat> inreg % ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mul_f32_e64 v22, 1.0, s17 -; SI-NEXT: 
v_mul_f32_e64 v23, 1.0, s16 -; SI-NEXT: v_mul_f32_e64 v20, 1.0, s19 -; SI-NEXT: v_mul_f32_e64 v21, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v18, 1.0, s21 -; SI-NEXT: v_mul_f32_e64 v19, 1.0, s20 -; SI-NEXT: v_mul_f32_e64 v16, 1.0, s23 -; SI-NEXT: v_mul_f32_e64 v17, 1.0, s22 -; SI-NEXT: v_mul_f32_e64 v14, 1.0, s25 -; SI-NEXT: v_mul_f32_e64 v15, 1.0, s24 -; SI-NEXT: v_mul_f32_e64 v12, 1.0, s27 -; SI-NEXT: v_mul_f32_e64 v13, 1.0, s26 -; SI-NEXT: v_mul_f32_e64 v10, 1.0, s29 -; SI-NEXT: v_mul_f32_e64 v11, 1.0, s28 -; SI-NEXT: v_mul_f32_e32 v8, 1.0, v1 -; SI-NEXT: v_mul_f32_e32 v9, 1.0, v0 +; SI-NEXT: v_mul_f32_e64 v31, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v22, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v30, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v20, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v29, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v18, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v28, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v16, 1.0, s22 +; SI-NEXT: v_mul_f32_e64 v27, 1.0, s25 +; SI-NEXT: v_mul_f32_e64 v14, 1.0, s24 +; SI-NEXT: v_mul_f32_e64 v26, 1.0, s27 +; SI-NEXT: v_mul_f32_e64 v25, 1.0, s29 +; SI-NEXT: v_mul_f32_e32 v24, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v8, 1.0, v0 +; SI-NEXT: v_mul_f32_e64 v12, 1.0, s26 +; SI-NEXT: v_mul_f32_e64 v10, 1.0, s28 ; SI-NEXT: s_cbranch_scc0 .LBB83_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_alignbit_b32 v0, v0, v23, 16 -; SI-NEXT: v_alignbit_b32 v1, v1, v21, 16 -; SI-NEXT: v_alignbit_b32 v2, v2, v19, 16 -; SI-NEXT: v_alignbit_b32 v3, v3, v17, 16 -; SI-NEXT: v_alignbit_b32 v4, v4, v15, 16 -; SI-NEXT: v_alignbit_b32 v5, v5, v13, 16 -; SI-NEXT: v_alignbit_b32 v6, v6, v11, 16 -; SI-NEXT: v_alignbit_b32 v7, v7, v9, 16 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v31 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v30 +; SI-NEXT: v_lshr_b64 v[0:1], v[22:23], 16 +; SI-NEXT: v_lshr_b64 v[1:2], v[20:21], 16 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v29 +; SI-NEXT: v_lshr_b64 v[2:3], v[18:19], 16 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v28 +; SI-NEXT: v_lshr_b64 v[3:4], v[16:17], 16 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v27 +; SI-NEXT: v_lshr_b64 v[4:5], v[14:15], 16 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v24 +; SI-NEXT: v_lshr_b64 v[5:6], v[12:13], 16 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v25 +; SI-NEXT: v_lshr_b64 v[32:33], v[8:9], 16 +; SI-NEXT: v_lshr_b64 v[6:7], v[10:11], 16 +; SI-NEXT: v_mov_b32_e32 v7, v32 ; SI-NEXT: s_cbranch_execnz .LBB83_3 ; SI-NEXT: .LBB83_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v22 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v23 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v31 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v22 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v30 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v20 -; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v21 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v18 -; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; SI-NEXT: v_and_b32_e32 v2, 
0xffff0000, v19 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 +; SI-NEXT: v_lshr_b64 v[1:2], v[2:3], 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v29 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v18 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v16 -; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v17 +; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 16 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v28 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v16 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v14 -; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v15 +; SI-NEXT: v_lshr_b64 v[3:4], v[3:4], 16 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v27 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v14 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v12 -; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v13 +; SI-NEXT: v_lshr_b64 v[4:5], v[4:5], 16 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v26 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v12 ; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 ; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v10 -; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v11 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v24 +; SI-NEXT: v_lshr_b64 v[5:6], v[5:6], 16 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v25 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v10 ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 ; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; SI-NEXT: v_alignbit_b32 v6, v7, v6, 16 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v9 -; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_alignbit_b32 v7, v8, v7, 16 +; SI-NEXT: v_lshr_b64 v[8:9], v[8:9], 16 +; SI-NEXT: v_lshr_b64 v[6:7], v[6:7], 16 +; SI-NEXT: v_mov_b32_e32 v7, v8 ; SI-NEXT: .LBB83_3: ; %end ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB83_4: @@ -24191,150 +24208,153 @@ define inreg <4 x double> @bitcast_v16bf16_to_v4f64_scalar(<16 x bfloat> inreg % ; VI-NEXT: s_cbranch_execnz .LBB83_4 ; VI-NEXT: .LBB83_2: ; %cmp.true ; VI-NEXT: s_lshl_b32 s4, s23, 16 -; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_mov_b32_e32 v8, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v0, s4, v8 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; VI-NEXT: v_add_f32_e32 v1, s4, v8 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; VI-NEXT: 
v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 +; VI-NEXT: s_lshl_b32 s4, s22, 16 ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_add_f32_e32 v2, s4, v8 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s22, 16 -; VI-NEXT: v_alignbit_b32 v7, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; VI-NEXT: s_and_b32 s4, s22, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_add_f32_e32 v3, s4, v8 +; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: s_lshl_b32 s4, s21, 16 +; VI-NEXT: v_lshrrev_b64 v[6:7], 16, v[2:3] +; VI-NEXT: v_add_f32_e32 v2, s4, v8 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s21, 16 -; VI-NEXT: v_alignbit_b32 v6, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_add_f32_e32 v2, s4, v8 +; VI-NEXT: v_cndmask_b32_e32 v9, v3, v4, vcc ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; VI-NEXT: s_lshl_b32 s4, s20, 16 -; VI-NEXT: v_alignbit_b32 v5, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v2 +; VI-NEXT: v_add_f32_e32 v2, s4, v8 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 ; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_add_f32_e32 v3, s4, v8 +; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, 
v4, v3 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; VI-NEXT: s_lshl_b32 s4, s19, 16 -; VI-NEXT: v_alignbit_b32 v4, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_lshrrev_b64 v[4:5], 16, v[2:3] +; VI-NEXT: v_add_f32_e32 v2, s4, v8 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_add_f32_e32 v2, s4, v8 +; VI-NEXT: v_cndmask_b32_e32 v11, v3, v5, vcc ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v8, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v5, vcc ; VI-NEXT: s_lshl_b32 s4, s18, 16 -; VI-NEXT: v_alignbit_b32 v3, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v8, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v8, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v8, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v8, vcc, v8, v2 -; VI-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; VI-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v2 +; VI-NEXT: v_add_f32_e32 v2, s4, v8 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v8, v9, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v5, vcc +; VI-NEXT: v_add_f32_e32 v3, s4, v8 +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_lshrrev_b64 v[2:3], 16, v[2:3] ; VI-NEXT: s_lshl_b32 s4, s17, 16 -; VI-NEXT: v_alignbit_b32 v2, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v8, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v8, vcc, v8, v1 -; VI-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; VI-NEXT: v_or_b32_e32 v9, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_add_f32_e32 v3, s4, v8 +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 ; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc -; VI-NEXT: v_add_f32_e32 v8, s4, v0 -; VI-NEXT: v_bfe_u32 v9, v8, 16, 1 -; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8 -; VI-NEXT: 
v_add_u32_e32 v9, vcc, 0x7fff, v9 -; VI-NEXT: v_or_b32_e32 v10, 0x400000, v8 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 -; VI-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc -; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_add_f32_e32 v3, s4, v8 +; VI-NEXT: v_cndmask_b32_e32 v13, v5, v7, vcc +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc ; VI-NEXT: s_lshl_b32 s4, s16, 16 -; VI-NEXT: v_alignbit_b32 v1, v8, v1, 16 -; VI-NEXT: v_add_f32_e32 v8, s4, v0 -; VI-NEXT: v_bfe_u32 v9, v8, 16, 1 -; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8 -; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v10, 0x400000, v8 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 -; VI-NEXT: v_add_f32_e32 v0, s4, v0 -; VI-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc -; VI-NEXT: v_bfe_u32 v9, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v0 -; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; VI-NEXT: v_or_b32_e32 v10, 0x400000, v0 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v9, v10, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v0, v8, 16 +; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v3 +; VI-NEXT: v_add_f32_e32 v3, s4, v8 +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: s_and_b32 s6, s16, 0xffff0000 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; VI-NEXT: v_add_f32_e32 v3, s6, v8 +; VI-NEXT: v_bfe_u32 v7, v3, 16, 1 +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v3 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v7 +; VI-NEXT: v_lshrrev_b64 v[7:8], 16, v[0:1] +; VI-NEXT: v_or_b32_e32 v17, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_lshrrev_b64 v[8:9], 16, v[9:10] +; VI-NEXT: v_lshrrev_b64 v[9:10], 16, v[11:12] +; VI-NEXT: v_cndmask_b32_e32 v1, v16, v17, vcc +; VI-NEXT: v_lshrrev_b64 v[10:11], 16, v[13:14] +; VI-NEXT: v_cndmask_b32_e64 v0, v5, v15, s[4:5] +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, v10 +; VI-NEXT: v_mov_b32_e32 v3, v9 +; VI-NEXT: v_mov_b32_e32 v5, v8 ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB83_3: ; VI-NEXT: s_branch .LBB83_2 @@ -29754,119 +29774,123 @@ define inreg <16 x i16> @bitcast_v16bf16_to_v16i16_scalar(<16 x bfloat> inreg %a ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mul_f32_e64 v31, 1.0, s16 ; SI-NEXT: v_mul_f32_e64 v30, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v20, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v16, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v18, 1.0, s19 ; SI-NEXT: v_mul_f32_e64 v29, 1.0, s20 ; SI-NEXT: v_mul_f32_e64 v28, 1.0, s21 -; SI-NEXT: v_mul_f32_e64 v22, 1.0, s22 -; SI-NEXT: v_mul_f32_e64 v5, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v6, 1.0, s22 +; SI-NEXT: v_mul_f32_e64 v20, 1.0, s23 ; SI-NEXT: v_mul_f32_e64 v27, 1.0, s24 ; SI-NEXT: v_mul_f32_e64 v26, 1.0, s25 -; SI-NEXT: v_mul_f32_e64 v23, 1.0, s26 -; SI-NEXT: v_mul_f32_e64 v9, 1.0, s27 +; SI-NEXT: v_mul_f32_e64 v10, 1.0, s26 +; SI-NEXT: v_mul_f32_e64 v22, 1.0, s27 ; SI-NEXT: v_mul_f32_e64 v25, 1.0, s28 ; SI-NEXT: v_mul_f32_e64 v24, 1.0, s29 -; SI-NEXT: v_mul_f32_e32 v13, 1.0, v0 -; 
SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v14, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v23, 1.0, v1 ; SI-NEXT: s_cbranch_scc0 .LBB95_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v31 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v30 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v18 ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v29 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v28 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v20 ; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v27 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v22 ; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v25 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v23 ; SI-NEXT: s_cbranch_execnz .LBB95_3 ; SI-NEXT: .LBB95_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v30 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v30 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v31 -; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v1 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v12 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v28 -; SI-NEXT: v_alignbit_b32 v0, v2, v0, 16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v29 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v26 -; SI-NEXT: v_alignbit_b32 v4, v4, v2, 16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v27 -; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; SI-NEXT: v_alignbit_b32 v8, v7, v2, 16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v25 -; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v24 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v2 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v13 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v9 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v23 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v30 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v29 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v28 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; SI-NEXT: v_lshr_b64 v[4:5], v[3:4], 16 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v25 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v24 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_and_b32_e32 v3, 
0xffff0000, v27 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v5 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v26 +; SI-NEXT: v_lshr_b64 v[12:13], v[11:12], 16 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v10 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v3 +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v22 +; SI-NEXT: v_lshr_b64 v[8:9], v[7:8], 16 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v14 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v23 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v5 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; SI-NEXT: v_alignbit_b32 v10, v11, v1, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v22 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_alignbit_b32 v14, v15, v7, 16 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v16 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v6 -; SI-NEXT: v_alignbit_b32 v6, v7, v1, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v20 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v20 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v18 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v7 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; SI-NEXT: v_alignbit_b32 v2, v3, v1, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v12 -; SI-NEXT: v_lshr_b64 v[17:18], v[1:2], 16 -; SI-NEXT: v_lshr_b64 v[18:19], v[5:6], 16 -; SI-NEXT: v_lshr_b64 v[21:22], v[9:10], 16 -; SI-NEXT: v_lshr_b64 v[19:20], v[13:14], 16 -; SI-NEXT: v_alignbit_b32 v12, v24, v25, 16 +; SI-NEXT: v_lshr_b64 v[16:17], v[14:15], 16 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; SI-NEXT: v_lshr_b64 v[21:22], v[10:11], 16 +; SI-NEXT: v_lshr_b64 v[19:20], v[6:7], 16 +; SI-NEXT: v_lshr_b64 v[17:18], v[2:3], 16 +; SI-NEXT: v_mov_b32_e32 v14, v16 +; SI-NEXT: v_mov_b32_e32 v10, v21 +; SI-NEXT: v_mov_b32_e32 v6, v19 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v30 +; SI-NEXT: v_mov_b32_e32 v2, v17 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: v_lshr_b64 v[5:6], v[5:6], 16 +; SI-NEXT: v_lshr_b64 v[9:10], v[9:10], 16 +; SI-NEXT: v_lshr_b64 v[13:14], v[13:14], 16 ; SI-NEXT: .LBB95_3: ; %end -; SI-NEXT: v_mov_b32_e32 v1, v17 -; SI-NEXT: v_mov_b32_e32 v5, v18 -; SI-NEXT: v_mov_b32_e32 v9, v21 -; SI-NEXT: v_mov_b32_e32 v13, v19 +; SI-NEXT: v_mov_b32_e32 v2, v17 +; SI-NEXT: v_mov_b32_e32 v6, v19 +; SI-NEXT: v_mov_b32_e32 v10, v21 +; SI-NEXT: v_mov_b32_e32 v14, v16 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB95_4: ; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: ; implicit-def: $vgpr8 +; 
SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: s_branch .LBB95_2 ; ; VI-LABEL: bitcast_v16bf16_to_v16i16_scalar: @@ -29878,150 +29902,154 @@ define inreg <16 x i16> @bitcast_v16bf16_to_v16i16_scalar(<16 x bfloat> inreg %a ; VI-NEXT: s_cbranch_execnz .LBB95_4 ; VI-NEXT: .LBB95_2: ; %cmp.true ; VI-NEXT: s_lshl_b32 s4, s16, 16 -; VI-NEXT: v_mov_b32_e32 v1, 0x40c00000 -; VI-NEXT: v_add_f32_e32 v0, s4, v1 -; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_mov_b32_e32 v10, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v0, s4, v10 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v1 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; VI-NEXT: v_add_f32_e32 v1, s4, v10 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_lshl_b32 s4, s17, 16 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v10 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_add_f32_e32 v2, s4, v10 +; VI-NEXT: v_cndmask_b32_e32 v8, v3, v4, vcc ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: s_lshl_b32 s4, s17, 16 ; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_add_f32_e32 v3, s4, v1 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_add_f32_e32 v3, s4, v1 -; VI-NEXT: v_cndmask_b32_e32 v8, v4, v5, vcc +; VI-NEXT: s_lshl_b32 s4, s18, 16 +; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v2 +; VI-NEXT: v_add_f32_e32 v2, s4, v10 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_add_f32_e32 v3, s4, v10 ; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 ; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 ; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 ; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: s_lshl_b32 s4, s18, 16 +; VI-NEXT: s_lshl_b32 s4, s19, 16 ; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: v_add_f32_e32 v4, s4, v1 +; VI-NEXT: v_add_f32_e32 v4, s4, v10 ; VI-NEXT: v_bfe_u32 v5, v4, 16, 
1 ; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 ; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 ; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; VI-NEXT: v_add_f32_e32 v4, s4, v1 -; VI-NEXT: v_cndmask_b32_e32 v9, v5, v6, vcc +; VI-NEXT: v_add_f32_e32 v4, s4, v10 +; VI-NEXT: v_cndmask_b32_e32 v11, v5, v6, vcc ; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 ; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 ; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 ; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; VI-NEXT: s_lshl_b32 s4, s19, 16 ; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; VI-NEXT: v_add_f32_e32 v5, s4, v1 -; VI-NEXT: v_bfe_u32 v6, v5, 16, 1 -; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v5 -; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; VI-NEXT: v_add_f32_e32 v5, s4, v1 -; VI-NEXT: v_cndmask_b32_e32 v10, v6, v7, vcc +; VI-NEXT: s_lshl_b32 s4, s20, 16 +; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v4 +; VI-NEXT: v_add_f32_e32 v4, s4, v10 +; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; VI-NEXT: v_add_f32_e32 v5, s4, v10 ; VI-NEXT: v_bfe_u32 v6, v5, 16, 1 ; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v5 ; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; VI-NEXT: s_and_b32 s5, s21, 0xffff0000 ; VI-NEXT: v_or_b32_e32 v7, 0x400000, v5 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; VI-NEXT: s_lshl_b32 s4, s20, 16 -; VI-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc -; VI-NEXT: v_add_f32_e32 v6, s4, v1 -; VI-NEXT: v_bfe_u32 v7, v6, 16, 1 -; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v6 -; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; VI-NEXT: v_add_f32_e32 v13, s5, v10 ; VI-NEXT: s_lshl_b32 s5, s22, 16 -; VI-NEXT: v_or_b32_e32 v11, 0x400000, v6 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; VI-NEXT: v_add_f32_e32 v6, s5, v1 -; VI-NEXT: v_cndmask_b32_e32 v11, v7, v11, vcc +; VI-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc +; VI-NEXT: v_add_f32_e32 v6, s5, v10 ; VI-NEXT: v_bfe_u32 v7, v6, 16, 1 ; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v6 ; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; VI-NEXT: v_or_b32_e32 v12, 0x400000, v6 +; VI-NEXT: v_or_b32_e32 v14, 0x400000, v6 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 ; VI-NEXT: s_and_b32 s5, s22, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v6, v7, v12, vcc -; VI-NEXT: v_add_f32_e32 v7, s5, v1 -; VI-NEXT: v_bfe_u32 v12, v7, 16, 1 -; VI-NEXT: v_add_u32_e32 v12, vcc, v12, v7 -; VI-NEXT: v_add_u32_e32 v12, vcc, 0x7fff, v12 -; VI-NEXT: v_or_b32_e32 v13, 0x400000, v7 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 -; VI-NEXT: v_cndmask_b32_e32 v7, v12, v13, vcc -; VI-NEXT: s_lshl_b32 s5, s23, 16 -; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v7 -; VI-NEXT: v_add_f32_e32 v7, s5, v1 -; VI-NEXT: v_bfe_u32 v13, v7, 16, 1 -; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v7 -; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 -; VI-NEXT: v_or_b32_e32 v14, 0x400000, v7 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 -; VI-NEXT: s_and_b32 s5, s23, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v7, v13, v14, vcc -; VI-NEXT: v_add_f32_e32 v13, s5, v1 -; VI-NEXT: v_bfe_u32 v14, v13, 16, 1 -; VI-NEXT: v_add_u32_e32 v14, vcc, v14, v13 +; VI-NEXT: v_cndmask_b32_e32 v6, v7, v14, vcc +; VI-NEXT: 
v_add_f32_e32 v7, s5, v10 +; VI-NEXT: v_bfe_u32 v14, v7, 16, 1 +; VI-NEXT: v_add_u32_e32 v14, vcc, v14, v7 +; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; VI-NEXT: v_add_u32_e32 v14, vcc, 0x7fff, v14 -; VI-NEXT: v_or_b32_e32 v15, 0x400000, v13 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 -; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v13, v14, v15, vcc -; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; VI-NEXT: v_alignbit_b32 v6, v12, v6, 16 -; VI-NEXT: v_add_f32_e32 v12, s4, v1 -; VI-NEXT: v_alignbit_b32 v7, v13, v7, 16 -; VI-NEXT: v_bfe_u32 v13, v12, 16, 1 -; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v12 -; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 -; VI-NEXT: v_or_b32_e32 v14, 0x400000, v12 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v7 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; VI-NEXT: v_lshrrev_b64 v[4:5], 16, v[4:5] +; VI-NEXT: v_cndmask_b32_e32 v7, v14, v15, vcc +; VI-NEXT: v_bfe_u32 v5, v13, 16, 1 +; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v13 ; VI-NEXT: s_lshl_b32 s4, s21, 16 -; VI-NEXT: v_cndmask_b32_e32 v12, v13, v14, vcc -; VI-NEXT: v_add_f32_e32 v13, s4, v1 -; VI-NEXT: v_bfe_u32 v14, v13, 16, 1 -; VI-NEXT: v_add_u32_e32 v14, vcc, v14, v13 -; VI-NEXT: v_add_u32_e32 v14, vcc, 0x7fff, v14 -; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v15, 0x400000, v13 +; VI-NEXT: v_lshrrev_b64 v[6:7], 16, v[6:7] +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v13 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 -; VI-NEXT: v_add_f32_e32 v1, s4, v1 -; VI-NEXT: v_cndmask_b32_e32 v13, v14, v15, vcc -; VI-NEXT: v_bfe_u32 v14, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v14, vcc, v14, v1 -; VI-NEXT: v_add_u32_e32 v14, vcc, 0x7fff, v14 -; VI-NEXT: v_or_b32_e32 v15, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: v_cndmask_b32_e32 v1, v14, v15, vcc -; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v2 -; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v3 -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v4 -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v12 +; VI-NEXT: v_add_f32_e32 v13, s4, v10 +; VI-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc +; VI-NEXT: v_bfe_u32 v7, v13, 16, 1 +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v13 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; VI-NEXT: v_or_b32_e32 v14, 0x400000, v13 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v13, v7, v14, vcc +; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v5 +; VI-NEXT: v_add_f32_e32 v5, s4, v10 +; VI-NEXT: v_bfe_u32 v7, v5, 16, 1 +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v5 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: s_lshl_b32 s4, s23, 16 +; VI-NEXT: v_cndmask_b32_e32 v5, v7, v15, vcc +; VI-NEXT: v_add_f32_e32 v7, s4, v10 +; VI-NEXT: v_bfe_u32 v10, v7, 16, 1 +; VI-NEXT: v_add_u32_e32 v10, vcc, v10, v7 +; VI-NEXT: v_add_u32_e32 v10, vcc, 0x7fff, v10 +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v7 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; VI-NEXT: v_cndmask_b32_e32 v15, v10, v15, vcc +; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v5 ; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-NEXT: v_alignbit_b32 v5, v1, v13, 16 -; VI-NEXT: v_alignbit_b32 v4, v4, v11, 16 -; VI-NEXT: v_alignbit_b32 v3, v3, v10, 16 -; VI-NEXT: v_alignbit_b32 v2, v2, v9, 16 -; VI-NEXT: v_alignbit_b32 v1, v15, v8, 16 -; VI-NEXT: v_alignbit_b32 v0, v14, v0, 16 +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 
+; VI-NEXT: v_lshrrev_b64 v[15:16], 16, v[15:16] +; VI-NEXT: v_lshrrev_b64 v[13:14], 16, v[13:14] +; VI-NEXT: v_lshrrev_b64 v[10:11], 16, v[11:12] +; VI-NEXT: v_lshrrev_b64 v[7:8], 16, v[8:9] +; VI-NEXT: v_lshrrev_b64 v[2:3], 16, v[2:3] +; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, v7 +; VI-NEXT: v_mov_b32_e32 v3, v10 +; VI-NEXT: v_mov_b32_e32 v5, v13 +; VI-NEXT: v_mov_b32_e32 v7, v15 ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB95_3: ; VI-NEXT: s_branch .LBB95_2 @@ -35136,150 +35164,154 @@ define inreg <16 x half> @bitcast_v16bf16_to_v16f16_scalar(<16 x bfloat> inreg % ; VI-NEXT: s_cbranch_execnz .LBB103_4 ; VI-NEXT: .LBB103_2: ; %cmp.true ; VI-NEXT: s_lshl_b32 s4, s16, 16 -; VI-NEXT: v_mov_b32_e32 v1, 0x40c00000 -; VI-NEXT: v_add_f32_e32 v0, s4, v1 -; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_mov_b32_e32 v10, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v0, s4, v10 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v1 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; VI-NEXT: v_add_f32_e32 v1, s4, v10 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_lshl_b32 s4, s17, 16 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v10 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_add_f32_e32 v2, s4, v10 +; VI-NEXT: v_cndmask_b32_e32 v8, v3, v4, vcc ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: s_lshl_b32 s4, s17, 16 ; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_add_f32_e32 v3, s4, v1 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_add_f32_e32 v3, s4, v1 -; VI-NEXT: v_cndmask_b32_e32 v8, v4, v5, vcc +; VI-NEXT: s_lshl_b32 s4, s18, 16 +; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v2 +; VI-NEXT: v_add_f32_e32 v2, s4, v10 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_add_f32_e32 v3, s4, v10 ; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 ; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 ; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 ; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: s_lshl_b32 s4, s18, 16 +; VI-NEXT: s_lshl_b32 s4, s19, 16 ; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: v_add_f32_e32 v4, s4, 
v1 +; VI-NEXT: v_add_f32_e32 v4, s4, v10 ; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 ; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 ; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 ; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; VI-NEXT: v_add_f32_e32 v4, s4, v1 -; VI-NEXT: v_cndmask_b32_e32 v9, v5, v6, vcc +; VI-NEXT: v_add_f32_e32 v4, s4, v10 +; VI-NEXT: v_cndmask_b32_e32 v11, v5, v6, vcc ; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 ; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 ; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 ; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; VI-NEXT: s_lshl_b32 s4, s19, 16 ; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; VI-NEXT: v_add_f32_e32 v5, s4, v1 -; VI-NEXT: v_bfe_u32 v6, v5, 16, 1 -; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v5 -; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; VI-NEXT: v_add_f32_e32 v5, s4, v1 -; VI-NEXT: v_cndmask_b32_e32 v10, v6, v7, vcc +; VI-NEXT: s_lshl_b32 s4, s20, 16 +; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v4 +; VI-NEXT: v_add_f32_e32 v4, s4, v10 +; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; VI-NEXT: v_add_f32_e32 v5, s4, v10 ; VI-NEXT: v_bfe_u32 v6, v5, 16, 1 ; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v5 ; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; VI-NEXT: s_and_b32 s5, s21, 0xffff0000 ; VI-NEXT: v_or_b32_e32 v7, 0x400000, v5 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; VI-NEXT: s_lshl_b32 s4, s20, 16 -; VI-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc -; VI-NEXT: v_add_f32_e32 v6, s4, v1 -; VI-NEXT: v_bfe_u32 v7, v6, 16, 1 -; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v6 -; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; VI-NEXT: v_add_f32_e32 v13, s5, v10 ; VI-NEXT: s_lshl_b32 s5, s22, 16 -; VI-NEXT: v_or_b32_e32 v11, 0x400000, v6 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; VI-NEXT: v_add_f32_e32 v6, s5, v1 -; VI-NEXT: v_cndmask_b32_e32 v11, v7, v11, vcc +; VI-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc +; VI-NEXT: v_add_f32_e32 v6, s5, v10 ; VI-NEXT: v_bfe_u32 v7, v6, 16, 1 ; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v6 ; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; VI-NEXT: v_or_b32_e32 v12, 0x400000, v6 +; VI-NEXT: v_or_b32_e32 v14, 0x400000, v6 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 ; VI-NEXT: s_and_b32 s5, s22, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v6, v7, v12, vcc -; VI-NEXT: v_add_f32_e32 v7, s5, v1 -; VI-NEXT: v_bfe_u32 v12, v7, 16, 1 -; VI-NEXT: v_add_u32_e32 v12, vcc, v12, v7 -; VI-NEXT: v_add_u32_e32 v12, vcc, 0x7fff, v12 -; VI-NEXT: v_or_b32_e32 v13, 0x400000, v7 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 -; VI-NEXT: v_cndmask_b32_e32 v7, v12, v13, vcc -; VI-NEXT: s_lshl_b32 s5, s23, 16 -; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v7 -; VI-NEXT: v_add_f32_e32 v7, s5, v1 -; VI-NEXT: v_bfe_u32 v13, v7, 16, 1 -; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v7 -; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 -; VI-NEXT: v_or_b32_e32 v14, 0x400000, v7 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 -; VI-NEXT: s_and_b32 s5, s23, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v7, v13, v14, vcc -; VI-NEXT: v_add_f32_e32 v13, s5, v1 -; VI-NEXT: v_bfe_u32 v14, v13, 16, 1 -; VI-NEXT: v_add_u32_e32 v14, 
vcc, v14, v13 +; VI-NEXT: v_cndmask_b32_e32 v6, v7, v14, vcc +; VI-NEXT: v_add_f32_e32 v7, s5, v10 +; VI-NEXT: v_bfe_u32 v14, v7, 16, 1 +; VI-NEXT: v_add_u32_e32 v14, vcc, v14, v7 +; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; VI-NEXT: v_add_u32_e32 v14, vcc, 0x7fff, v14 -; VI-NEXT: v_or_b32_e32 v15, 0x400000, v13 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 -; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v13, v14, v15, vcc -; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; VI-NEXT: v_alignbit_b32 v6, v12, v6, 16 -; VI-NEXT: v_add_f32_e32 v12, s4, v1 -; VI-NEXT: v_alignbit_b32 v7, v13, v7, 16 -; VI-NEXT: v_bfe_u32 v13, v12, 16, 1 -; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v12 -; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 -; VI-NEXT: v_or_b32_e32 v14, 0x400000, v12 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v7 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; VI-NEXT: v_lshrrev_b64 v[4:5], 16, v[4:5] +; VI-NEXT: v_cndmask_b32_e32 v7, v14, v15, vcc +; VI-NEXT: v_bfe_u32 v5, v13, 16, 1 +; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v13 ; VI-NEXT: s_lshl_b32 s4, s21, 16 -; VI-NEXT: v_cndmask_b32_e32 v12, v13, v14, vcc -; VI-NEXT: v_add_f32_e32 v13, s4, v1 -; VI-NEXT: v_bfe_u32 v14, v13, 16, 1 -; VI-NEXT: v_add_u32_e32 v14, vcc, v14, v13 -; VI-NEXT: v_add_u32_e32 v14, vcc, 0x7fff, v14 -; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v15, 0x400000, v13 +; VI-NEXT: v_lshrrev_b64 v[6:7], 16, v[6:7] +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v13 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 -; VI-NEXT: v_add_f32_e32 v1, s4, v1 -; VI-NEXT: v_cndmask_b32_e32 v13, v14, v15, vcc -; VI-NEXT: v_bfe_u32 v14, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v14, vcc, v14, v1 -; VI-NEXT: v_add_u32_e32 v14, vcc, 0x7fff, v14 -; VI-NEXT: v_or_b32_e32 v15, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: v_cndmask_b32_e32 v1, v14, v15, vcc -; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v2 -; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v3 -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v4 -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v12 +; VI-NEXT: v_add_f32_e32 v13, s4, v10 +; VI-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc +; VI-NEXT: v_bfe_u32 v7, v13, 16, 1 +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v13 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; VI-NEXT: v_or_b32_e32 v14, 0x400000, v13 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v13, v7, v14, vcc +; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v5 +; VI-NEXT: v_add_f32_e32 v5, s4, v10 +; VI-NEXT: v_bfe_u32 v7, v5, 16, 1 +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v5 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: s_lshl_b32 s4, s23, 16 +; VI-NEXT: v_cndmask_b32_e32 v5, v7, v15, vcc +; VI-NEXT: v_add_f32_e32 v7, s4, v10 +; VI-NEXT: v_bfe_u32 v10, v7, 16, 1 +; VI-NEXT: v_add_u32_e32 v10, vcc, v10, v7 +; VI-NEXT: v_add_u32_e32 v10, vcc, 0x7fff, v10 +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v7 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; VI-NEXT: v_cndmask_b32_e32 v15, v10, v15, vcc +; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v5 ; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-NEXT: v_alignbit_b32 v5, v1, v13, 16 -; VI-NEXT: v_alignbit_b32 v4, v4, v11, 16 -; VI-NEXT: v_alignbit_b32 v3, v3, v10, 16 -; VI-NEXT: v_alignbit_b32 v2, v2, v9, 16 -; VI-NEXT: v_alignbit_b32 v1, v15, v8, 16 -; 
VI-NEXT: v_alignbit_b32 v0, v14, v0, 16 +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_lshrrev_b64 v[15:16], 16, v[15:16] +; VI-NEXT: v_lshrrev_b64 v[13:14], 16, v[13:14] +; VI-NEXT: v_lshrrev_b64 v[10:11], 16, v[11:12] +; VI-NEXT: v_lshrrev_b64 v[7:8], 16, v[8:9] +; VI-NEXT: v_lshrrev_b64 v[2:3], 16, v[2:3] +; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, v7 +; VI-NEXT: v_mov_b32_e32 v3, v10 +; VI-NEXT: v_mov_b32_e32 v5, v13 +; VI-NEXT: v_mov_b32_e32 v7, v15 ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB103_3: ; VI-NEXT: s_branch .LBB103_2 @@ -40103,186 +40135,217 @@ define inreg <32 x i8> @bitcast_v16bf16_to_v32i8_scalar(<16 x bfloat> inreg %a, ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mul_f32_e64 v24, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v32, 1.0, s16 -; SI-NEXT: v_mul_f32_e64 v8, 1.0, s19 -; SI-NEXT: v_mul_f32_e64 v16, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v51, 1.0, s21 -; SI-NEXT: v_mul_f32_e64 v52, 1.0, s20 -; SI-NEXT: v_mul_f32_e64 v39, 1.0, s23 -; SI-NEXT: v_mul_f32_e64 v50, 1.0, s22 -; SI-NEXT: v_mul_f32_e64 v55, 1.0, s25 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_mul_f32_e64 v40, 1.0, s24 -; SI-NEXT: v_mul_f32_e64 v53, 1.0, s27 -; SI-NEXT: v_mul_f32_e64 v54, 1.0, s26 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mul_f32_e64 v42, 1.0, s29 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_mul_f32_e64 v56, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v40, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v55, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v5, 1.0, s18 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_mul_f32_e64 v58, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v42, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v57, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v13, 1.0, s22 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_mul_f32_e64 v60, 1.0, s25 +; SI-NEXT: v_mul_f32_e64 v44, 1.0, s24 +; SI-NEXT: v_mul_f32_e64 v59, 1.0, s27 ; SI-NEXT: 
s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v43, 1.0, s28 -; SI-NEXT: v_mul_f32_e32 v41, 1.0, v1 -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_mul_f32_e64 v62, 1.0, s29 +; SI-NEXT: v_mul_f32_e32 v61, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v29, 1.0, v0 +; SI-NEXT: v_mul_f32_e64 v21, 1.0, s26 +; SI-NEXT: v_mul_f32_e64 v46, 1.0, s28 ; SI-NEXT: s_cbranch_scc0 .LBB109_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v8 -; SI-NEXT: v_alignbit_b32 v48, v1, v32, 16 -; SI-NEXT: v_alignbit_b32 v49, v6, v16, 16 -; SI-NEXT: v_lshr_b64 v[1:2], v[48:49], 8 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v51 -; SI-NEXT: v_alignbit_b32 v37, v2, v52, 16 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v55 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v39 -; SI-NEXT: v_alignbit_b32 v35, v2, v40, 16 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v53 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v42 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v41 -; SI-NEXT: v_alignbit_b32 v38, v14, v50, 16 -; SI-NEXT: v_alignbit_b32 v36, v22, v54, 16 -; SI-NEXT: v_alignbit_b32 v33, v2, v43, 16 -; SI-NEXT: v_alignbit_b32 v34, v30, v0, 16 -; SI-NEXT: v_lshr_b64 v[3:4], v[48:49], 24 -; SI-NEXT: v_lshr_b64 v[11:12], v[37:38], 24 -; SI-NEXT: v_lshr_b64 v[19:20], v[35:36], 24 -; SI-NEXT: v_lshr_b64 v[27:28], v[33:34], 24 -; SI-NEXT: v_lshr_b64 v[4:5], v[48:49], 16 -; SI-NEXT: v_lshr_b64 v[12:13], v[37:38], 16 -; SI-NEXT: v_lshr_b64 v[20:21], v[35:36], 16 -; SI-NEXT: v_lshr_b64 v[28:29], v[33:34], 16 -; SI-NEXT: v_lshr_b64 v[9:10], v[37:38], 8 -; SI-NEXT: v_lshr_b64 v[17:18], v[35:36], 8 -; SI-NEXT: v_lshr_b64 v[25:26], v[33:34], 8 -; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v8 -; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v49 -; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v39 -; SI-NEXT: v_lshrrev_b32_e32 v13, 8, v38 -; SI-NEXT: v_lshrrev_b32_e32 v23, 24, v53 -; SI-NEXT: v_lshrrev_b32_e32 v21, 8, v36 -; SI-NEXT: v_lshrrev_b32_e32 v31, 24, v41 -; SI-NEXT: v_lshrrev_b32_e32 v29, 8, v34 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v55 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v57 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v59 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v56 +; SI-NEXT: v_lshr_b64 v[35:36], v[5:6], 16 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v58 +; SI-NEXT: v_lshr_b64 v[38:39], v[13:14], 16 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v60 +; SI-NEXT: v_lshr_b64 v[51:52], v[21:22], 16 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v61 +; SI-NEXT: v_lshr_b64 v[0:1], v[40:41], 16 +; SI-NEXT: v_lshr_b64 v[8:9], v[42:43], 16 +; SI-NEXT: v_lshr_b64 v[16:17], v[44:45], 16 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v62 +; SI-NEXT: v_lshr_b64 v[53:54], v[29:30], 16 +; SI-NEXT: v_mov_b32_e32 v1, v35 +; SI-NEXT: v_mov_b32_e32 v9, v38 +; SI-NEXT: v_mov_b32_e32 v17, v51 +; SI-NEXT: v_lshr_b64 v[24:25], v[46:47], 16 +; SI-NEXT: v_lshr_b64 v[33:34], v[0:1], 16 +; SI-NEXT: v_lshr_b64 v[11:12], v[8:9], 24 +; SI-NEXT: v_lshr_b64 v[36:37], v[8:9], 16 +; SI-NEXT: v_lshr_b64 v[9:10], v[8:9], 8 +; SI-NEXT: v_mov_b32_e32 v25, v53 +; SI-NEXT: v_lshr_b64 v[48:49], v[16:17], 16 +; SI-NEXT: v_lshr_b64 v[3:4], v[0:1], 24 +; SI-NEXT: v_lshr_b64 v[1:2], v[0:1], 8 +; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v55 +; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v57 +; SI-NEXT: v_lshrrev_b32_e32 v10, 8, v35 +; SI-NEXT: v_lshrrev_b32_e32 v23, 24, v59 +; SI-NEXT: v_lshrrev_b32_e32 v34, 8, v38 +; SI-NEXT: v_lshrrev_b32_e32 v31, 24, v61 +; SI-NEXT: v_lshrrev_b32_e32 v32, 8, v51 +; SI-NEXT: v_lshrrev_b32_e32 v37, 8, v53 +; SI-NEXT: v_lshr_b64 v[19:20], v[16:17], 24 +; SI-NEXT: v_lshr_b64 v[17:18], v[16:17], 8 +; 
SI-NEXT: v_lshr_b64 v[27:28], v[24:25], 24 +; SI-NEXT: v_lshr_b64 v[49:50], v[24:25], 16 +; SI-NEXT: v_lshr_b64 v[25:26], v[24:25], 8 ; SI-NEXT: s_cbranch_execnz .LBB109_3 ; SI-NEXT: .LBB109_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v42 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v43 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v62 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v46 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_alignbit_b32 v33, v2, v1, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v41 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v29 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v31 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v55 -; SI-NEXT: v_alignbit_b32 v34, v30, v0, 16 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v40 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v61 +; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v2 +; SI-NEXT: v_lshr_b64 v[24:25], v[0:1], 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v60 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v21 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v44 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v59 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_alignbit_b32 v35, v1, v0, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v53 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v54 -; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v1 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v23 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v51 -; SI-NEXT: v_alignbit_b32 v36, v22, v0, 16 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v52 +; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v13 +; SI-NEXT: v_lshr_b64 v[16:17], v[0:1], 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v58 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v57 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v42 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_alignbit_b32 v37, v1, v0, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v39 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v50 -; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v1 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v55 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v23 +; SI-NEXT: v_lshr_b64 v[8:9], v[0:1], 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v56 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v31 +; SI-NEXT: v_lshr_b64 v[51:52], v[21:22], 16 ; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v15 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v24 -; SI-NEXT: v_alignbit_b32 v38, v14, v0, 16 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v32 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v40 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v7 +; SI-NEXT: v_lshr_b64 v[53:54], v[29:30], 16 +; SI-NEXT: v_lshr_b64 v[38:39], v[13:14], 16 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 
16, v1 -; SI-NEXT: v_alignbit_b32 v48, v1, v0, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v8 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v16 -; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v1 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v7 -; SI-NEXT: v_alignbit_b32 v49, v6, v0, 16 -; SI-NEXT: v_lshr_b64 v[3:4], v[48:49], 24 -; SI-NEXT: v_lshr_b64 v[11:12], v[37:38], 24 -; SI-NEXT: v_lshr_b64 v[19:20], v[35:36], 24 -; SI-NEXT: v_lshr_b64 v[27:28], v[33:34], 24 -; SI-NEXT: v_lshr_b64 v[4:5], v[48:49], 16 -; SI-NEXT: v_lshr_b64 v[1:2], v[48:49], 8 -; SI-NEXT: v_lshr_b64 v[12:13], v[37:38], 16 -; SI-NEXT: v_lshr_b64 v[9:10], v[37:38], 8 -; SI-NEXT: v_lshr_b64 v[20:21], v[35:36], 16 -; SI-NEXT: v_lshr_b64 v[17:18], v[35:36], 8 -; SI-NEXT: v_lshr_b64 v[28:29], v[33:34], 16 -; SI-NEXT: v_lshr_b64 v[25:26], v[33:34], 8 -; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v49 -; SI-NEXT: v_lshrrev_b32_e32 v13, 8, v38 -; SI-NEXT: v_lshrrev_b32_e32 v21, 8, v36 -; SI-NEXT: v_lshrrev_b32_e32 v29, 8, v34 +; SI-NEXT: v_lshr_b64 v[35:36], v[5:6], 16 +; SI-NEXT: v_mov_b32_e32 v17, v51 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 +; SI-NEXT: v_mov_b32_e32 v25, v53 +; SI-NEXT: v_mov_b32_e32 v9, v38 +; SI-NEXT: v_mov_b32_e32 v1, v35 +; SI-NEXT: v_lshr_b64 v[48:49], v[16:17], 16 +; SI-NEXT: v_lshr_b64 v[3:4], v[0:1], 24 +; SI-NEXT: v_lshr_b64 v[33:34], v[0:1], 16 +; SI-NEXT: v_lshr_b64 v[1:2], v[0:1], 8 +; SI-NEXT: v_lshr_b64 v[11:12], v[8:9], 24 +; SI-NEXT: v_lshr_b64 v[36:37], v[8:9], 16 +; SI-NEXT: v_lshr_b64 v[9:10], v[8:9], 8 +; SI-NEXT: v_lshr_b64 v[19:20], v[16:17], 24 +; SI-NEXT: v_lshr_b64 v[17:18], v[16:17], 8 +; SI-NEXT: v_lshr_b64 v[27:28], v[24:25], 24 +; SI-NEXT: v_lshr_b64 v[49:50], v[24:25], 16 +; SI-NEXT: v_lshr_b64 v[25:26], v[24:25], 8 +; SI-NEXT: v_lshrrev_b32_e32 v10, 8, v35 +; SI-NEXT: v_lshrrev_b32_e32 v34, 8, v38 +; SI-NEXT: v_lshrrev_b32_e32 v32, 8, v51 +; SI-NEXT: v_lshrrev_b32_e32 v37, 8, v53 ; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v7 ; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v15 ; SI-NEXT: v_lshrrev_b32_e32 v23, 24, v23 ; SI-NEXT: v_lshrrev_b32_e32 v31, 24, v31 ; SI-NEXT: .LBB109_3: ; %end -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v0, v48 -; SI-NEXT: v_mov_b32_e32 v2, v4 -; SI-NEXT: v_mov_b32_e32 v4, v49 -; SI-NEXT: v_mov_b32_e32 v8, v37 -; SI-NEXT: v_mov_b32_e32 v10, v12 +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded 
Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v2, v33 +; SI-NEXT: v_mov_b32_e32 v4, v35 +; SI-NEXT: v_mov_b32_e32 v5, v10 +; SI-NEXT: v_mov_b32_e32 v10, v36 ; SI-NEXT: v_mov_b32_e32 v12, v38 -; SI-NEXT: v_mov_b32_e32 v16, v35 -; SI-NEXT: v_mov_b32_e32 v18, v20 -; SI-NEXT: v_mov_b32_e32 v20, v36 -; SI-NEXT: v_mov_b32_e32 v24, v33 -; SI-NEXT: v_mov_b32_e32 v26, v28 -; SI-NEXT: v_mov_b32_e32 v28, v34 +; SI-NEXT: v_mov_b32_e32 v13, v34 +; SI-NEXT: v_mov_b32_e32 v18, v48 +; SI-NEXT: v_mov_b32_e32 v20, v51 +; SI-NEXT: v_mov_b32_e32 v21, v32 +; SI-NEXT: v_mov_b32_e32 v26, v49 +; SI-NEXT: v_mov_b32_e32 v28, v53 +; SI-NEXT: v_mov_b32_e32 v29, v37 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB109_4: -; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: s_branch .LBB109_2 ; ; VI-LABEL: bitcast_v16bf16_to_v32i8_scalar: @@ -40291,26 +40354,26 @@ define inreg <32 x i8> @bitcast_v16bf16_to_v32i8_scalar(<16 x bfloat> inreg %a, ; VI-NEXT: s_cmp_lg_u32 s24, 0 ; VI-NEXT: s_cbranch_scc0 .LBB109_3 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: s_lshr_b32 s14, s23, 24 -; VI-NEXT: s_lshr_b32 s15, s23, 16 -; VI-NEXT: s_lshr_b32 s25, s23, 8 -; VI-NEXT: s_lshr_b32 s24, s22, 16 -; VI-NEXT: s_lshr_b32 s26, s22, 8 -; VI-NEXT: s_lshr_b32 s27, s21, 24 -; VI-NEXT: s_lshr_b32 s28, s21, 16 -; VI-NEXT: s_lshr_b32 s40, s21, 8 -; VI-NEXT: s_lshr_b32 s29, s20, 16 -; VI-NEXT: s_lshr_b32 s41, s20, 8 -; VI-NEXT: s_lshr_b32 s42, s19, 24 -; VI-NEXT: s_lshr_b32 s43, s19, 16 -; VI-NEXT: s_lshr_b32 s45, s19, 8 -; VI-NEXT: s_lshr_b32 s44, s18, 16 -; VI-NEXT: s_lshr_b32 s46, s18, 8 -; VI-NEXT: s_lshr_b32 s47, s17, 24 -; VI-NEXT: s_lshr_b32 s56, s17, 16 -; VI-NEXT: s_lshr_b32 s58, s17, 8 -; 
VI-NEXT: s_lshr_b32 s57, s16, 16 -; VI-NEXT: s_lshr_b32 s59, s16, 8 +; VI-NEXT: s_lshr_b32 s57, s23, 24 +; VI-NEXT: s_lshr_b32 s56, s23, 16 +; VI-NEXT: s_lshr_b32 s47, s23, 8 +; VI-NEXT: s_lshr_b32 s59, s22, 16 +; VI-NEXT: s_lshr_b32 s58, s22, 8 +; VI-NEXT: s_lshr_b32 s44, s21, 24 +; VI-NEXT: s_lshr_b32 s43, s21, 16 +; VI-NEXT: s_lshr_b32 s42, s21, 8 +; VI-NEXT: s_lshr_b32 s46, s20, 16 +; VI-NEXT: s_lshr_b32 s45, s20, 8 +; VI-NEXT: s_lshr_b32 s29, s19, 24 +; VI-NEXT: s_lshr_b32 s28, s19, 16 +; VI-NEXT: s_lshr_b32 s27, s19, 8 +; VI-NEXT: s_lshr_b32 s41, s18, 16 +; VI-NEXT: s_lshr_b32 s40, s18, 8 +; VI-NEXT: s_lshr_b32 s24, s17, 24 +; VI-NEXT: s_lshr_b32 s15, s17, 16 +; VI-NEXT: s_lshr_b32 s14, s17, 8 +; VI-NEXT: s_lshr_b32 s26, s16, 16 +; VI-NEXT: s_lshr_b32 s25, s16, 8 ; VI-NEXT: s_lshr_b64 s[10:11], s[22:23], 24 ; VI-NEXT: s_lshr_b64 s[8:9], s[20:21], 24 ; VI-NEXT: s_lshr_b64 s[6:7], s[18:19], 24 @@ -40336,225 +40399,225 @@ define inreg <32 x i8> @bitcast_v16bf16_to_v32i8_scalar(<16 x bfloat> inreg %a, ; VI-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc ; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; VI-NEXT: s_lshl_b32 s4, s16, 16 -; VI-NEXT: v_alignbit_b32 v1, v1, v0, 16 +; VI-NEXT: v_lshrrev_b64 v[4:5], 16, v[0:1] ; VI-NEXT: v_add_f32_e32 v0, s4, v2 -; VI-NEXT: v_bfe_u32 v3, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v0 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v0 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc +; VI-NEXT: v_add_f32_e32 v1, s4, v2 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_lshl_b32 s4, s19, 16 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc ; VI-NEXT: v_add_f32_e32 v3, s4, v2 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v3 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: s_lshl_b32 s4, s19, 16 -; VI-NEXT: v_alignbit_b32 v0, v3, v0, 16 ; VI-NEXT: v_add_f32_e32 v3, s4, v2 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; VI-NEXT: v_cndmask_b32_e32 v5, v5, v6, vcc +; VI-NEXT: v_bfe_u32 v6, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v3 +; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: v_add_f32_e32 v4, s4, v2 -; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; VI-NEXT: 
v_lshrrev_b32_e32 v4, 16, v4 +; VI-NEXT: v_cndmask_b32_e32 v3, v6, v7, vcc ; VI-NEXT: s_lshl_b32 s4, s18, 16 -; VI-NEXT: v_alignbit_b32 v9, v4, v3, 16 +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v3 ; VI-NEXT: v_add_f32_e32 v3, s4, v2 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: v_add_f32_e32 v4, s4, v2 -; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 +; VI-NEXT: v_lshrrev_b64 v[12:13], 16, v[5:6] +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 ; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; VI-NEXT: s_lshl_b32 s4, s21, 16 -; VI-NEXT: v_alignbit_b32 v8, v4, v3, 16 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; VI-NEXT: v_add_f32_e32 v3, s4, v2 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; VI-NEXT: v_cndmask_b32_e32 v5, v5, v6, vcc +; VI-NEXT: v_bfe_u32 v6, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v3 +; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: v_add_f32_e32 v4, s4, v2 -; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 +; VI-NEXT: v_cndmask_b32_e32 v3, v6, v7, vcc +; VI-NEXT: s_lshl_b32 s4, s21, 16 +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v3 +; VI-NEXT: v_add_f32_e32 v3, s4, v2 +; VI-NEXT: v_lshrrev_b64 v[8:9], 16, v[5:6] +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 ; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; VI-NEXT: s_lshl_b32 s4, s20, 16 -; VI-NEXT: v_alignbit_b32 v17, v4, v3, 16 +; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; VI-NEXT: v_add_f32_e32 v3, s4, v2 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; VI-NEXT: v_cndmask_b32_e32 v5, v5, v6, vcc +; VI-NEXT: v_bfe_u32 v6, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v3 +; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: v_add_f32_e32 v4, s4, v2 -; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 +; VI-NEXT: v_cndmask_b32_e32 v3, v6, v7, vcc +; VI-NEXT: s_lshl_b32 s4, s20, 16 +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v3 +; VI-NEXT: v_add_f32_e32 v3, s4, v2 +; VI-NEXT: v_lshrrev_b64 v[20:21], 16, v[5:6] +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 ; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_or_b32_e32 v6, 0x400000, 
v4 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; VI-NEXT: s_lshl_b32 s4, s23, 16 -; VI-NEXT: v_alignbit_b32 v16, v4, v3, 16 +; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; VI-NEXT: v_add_f32_e32 v3, s4, v2 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; VI-NEXT: v_cndmask_b32_e32 v5, v5, v6, vcc +; VI-NEXT: v_bfe_u32 v6, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v3 +; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: v_add_f32_e32 v4, s4, v2 -; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 +; VI-NEXT: v_cndmask_b32_e32 v3, v6, v7, vcc +; VI-NEXT: s_lshl_b32 s4, s23, 16 +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v3 +; VI-NEXT: v_add_f32_e32 v3, s4, v2 +; VI-NEXT: v_lshrrev_b64 v[16:17], 16, v[5:6] +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 ; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_add_f32_e32 v3, s4, v2 +; VI-NEXT: v_cndmask_b32_e32 v5, v5, v6, vcc +; VI-NEXT: v_bfe_u32 v6, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v3 +; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v6, v7, vcc ; VI-NEXT: s_lshl_b32 s4, s22, 16 -; VI-NEXT: v_alignbit_b32 v25, v4, v3, 16 +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v3 ; VI-NEXT: v_add_f32_e32 v3, s4, v2 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: v_lshrrev_b64 v[28:29], 16, v[5:6] +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 ; VI-NEXT: s_and_b32 s4, s22, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 ; VI-NEXT: v_add_f32_e32 v2, s4, v2 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: v_bfe_u32 v4, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v2 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v5, v5, v6, vcc +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: v_alignbit_b32 v24, v2, v3, 16 -; VI-NEXT: v_lshrrev_b64 v[27:28], 24, v[24:25] -; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[16:17] -; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[8:9] -; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1] -; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v25 -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v25 -; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v25 +; 
VI-NEXT: v_cndmask_b32_e32 v2, v3, v6, vcc +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_lshrrev_b64 v[24:25], 16, v[5:6] +; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] +; VI-NEXT: v_mov_b32_e32 v25, v28 +; VI-NEXT: v_mov_b32_e32 v1, v4 +; VI-NEXT: v_mov_b32_e32 v9, v12 +; VI-NEXT: v_mov_b32_e32 v17, v20 +; VI-NEXT: v_lshrrev_b64 v[36:37], 24, v[24:25] +; VI-NEXT: v_lshrrev_b64 v[37:38], 24, v[16:17] +; VI-NEXT: v_lshrrev_b64 v[34:35], 24, v[8:9] +; VI-NEXT: v_lshrrev_b64 v[32:33], 24, v[0:1] +; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v28 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v28 +; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v28 ; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v24 -; VI-NEXT: v_lshrrev_b32_e32 v32, 8, v24 -; VI-NEXT: v_lshrrev_b32_e32 v23, 24, v17 -; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v17 -; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v17 +; VI-NEXT: v_lshrrev_b32_e32 v25, 8, v24 +; VI-NEXT: v_lshrrev_b32_e32 v23, 24, v20 +; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v20 ; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v16 -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v16 -; VI-NEXT: v_lshrrev_b32_e32 v15, 24, v9 -; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v9 -; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v9 +; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v16 +; VI-NEXT: v_lshrrev_b32_e32 v15, 24, v12 +; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v12 ; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v8 -; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v8 -; VI-NEXT: v_lshrrev_b32_e32 v7, 24, v1 -; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v1 -; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v8 +; VI-NEXT: v_lshrrev_b32_e32 v7, 24, v4 +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v4 ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; VI-NEXT: v_lshrrev_b32_e32 v35, 8, v0 +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v0 ; VI-NEXT: s_branch .LBB109_5 ; VI-NEXT: .LBB109_3: -; VI-NEXT: ; implicit-def: $sgpr59 -; VI-NEXT: ; implicit-def: $sgpr57 +; VI-NEXT: ; implicit-def: $sgpr25 +; VI-NEXT: ; implicit-def: $sgpr26 ; VI-NEXT: ; implicit-def: $sgpr4 -; VI-NEXT: ; implicit-def: $sgpr58 -; VI-NEXT: ; implicit-def: $sgpr56 -; VI-NEXT: ; implicit-def: $sgpr47 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr44 -; VI-NEXT: ; implicit-def: $sgpr6 -; VI-NEXT: ; implicit-def: $sgpr45 -; VI-NEXT: ; implicit-def: $sgpr43 -; VI-NEXT: ; implicit-def: $sgpr42 +; VI-NEXT: ; implicit-def: $sgpr14 +; VI-NEXT: ; implicit-def: $sgpr15 +; VI-NEXT: ; implicit-def: $sgpr24 +; VI-NEXT: ; implicit-def: $sgpr40 ; VI-NEXT: ; implicit-def: $sgpr41 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr27 +; VI-NEXT: ; implicit-def: $sgpr28 ; VI-NEXT: ; implicit-def: $sgpr29 +; VI-NEXT: ; implicit-def: $sgpr45 +; VI-NEXT: ; implicit-def: $sgpr46 ; VI-NEXT: ; implicit-def: $sgpr8 -; VI-NEXT: ; implicit-def: $sgpr40 -; VI-NEXT: ; implicit-def: $sgpr28 -; VI-NEXT: ; implicit-def: $sgpr27 -; VI-NEXT: ; implicit-def: $sgpr26 -; VI-NEXT: ; implicit-def: $sgpr24 +; VI-NEXT: ; implicit-def: $sgpr42 +; VI-NEXT: ; implicit-def: $sgpr43 +; VI-NEXT: ; implicit-def: $sgpr44 +; VI-NEXT: ; implicit-def: $sgpr58 +; VI-NEXT: ; implicit-def: $sgpr59 ; VI-NEXT: ; implicit-def: $sgpr10 -; VI-NEXT: ; implicit-def: $sgpr25 -; VI-NEXT: ; implicit-def: $sgpr15 -; VI-NEXT: ; implicit-def: $sgpr14 +; VI-NEXT: ; implicit-def: $sgpr47 +; VI-NEXT: ; implicit-def: $sgpr56 +; VI-NEXT: ; implicit-def: $sgpr57 ; VI-NEXT: s_branch .LBB109_2 ; VI-NEXT: 
.LBB109_4: -; VI-NEXT: v_mov_b32_e32 v0, s16 -; VI-NEXT: v_mov_b32_e32 v1, s17 -; VI-NEXT: v_mov_b32_e32 v8, s18 -; VI-NEXT: v_mov_b32_e32 v9, s19 -; VI-NEXT: v_mov_b32_e32 v16, s20 -; VI-NEXT: v_mov_b32_e32 v17, s21 +; VI-NEXT: v_mov_b32_e32 v26, s59 +; VI-NEXT: v_mov_b32_e32 v25, s58 +; VI-NEXT: v_mov_b32_e32 v31, s57 +; VI-NEXT: v_mov_b32_e32 v30, s56 +; VI-NEXT: v_mov_b32_e32 v29, s47 +; VI-NEXT: v_mov_b32_e32 v18, s46 +; VI-NEXT: v_mov_b32_e32 v17, s45 +; VI-NEXT: v_mov_b32_e32 v23, s44 +; VI-NEXT: v_mov_b32_e32 v22, s43 +; VI-NEXT: v_mov_b32_e32 v21, s42 +; VI-NEXT: v_mov_b32_e32 v10, s41 +; VI-NEXT: v_mov_b32_e32 v9, s40 +; VI-NEXT: v_mov_b32_e32 v15, s29 +; VI-NEXT: v_mov_b32_e32 v14, s28 +; VI-NEXT: v_mov_b32_e32 v13, s27 +; VI-NEXT: v_mov_b32_e32 v2, s26 +; VI-NEXT: v_mov_b32_e32 v1, s25 +; VI-NEXT: v_mov_b32_e32 v7, s24 +; VI-NEXT: v_mov_b32_e32 v6, s15 +; VI-NEXT: v_mov_b32_e32 v5, s14 ; VI-NEXT: v_mov_b32_e32 v24, s22 -; VI-NEXT: v_mov_b32_e32 v25, s23 -; VI-NEXT: v_mov_b32_e32 v35, s59 -; VI-NEXT: v_mov_b32_e32 v2, s57 -; VI-NEXT: v_mov_b32_e32 v5, s58 -; VI-NEXT: v_mov_b32_e32 v6, s56 -; VI-NEXT: v_mov_b32_e32 v7, s47 -; VI-NEXT: v_mov_b32_e32 v34, s46 -; VI-NEXT: v_mov_b32_e32 v10, s44 -; VI-NEXT: v_mov_b32_e32 v13, s45 -; VI-NEXT: v_mov_b32_e32 v14, s43 -; VI-NEXT: v_mov_b32_e32 v15, s42 -; VI-NEXT: v_mov_b32_e32 v33, s41 -; VI-NEXT: v_mov_b32_e32 v18, s29 -; VI-NEXT: v_mov_b32_e32 v21, s40 -; VI-NEXT: v_mov_b32_e32 v22, s28 -; VI-NEXT: v_mov_b32_e32 v23, s27 -; VI-NEXT: v_mov_b32_e32 v32, s26 -; VI-NEXT: v_mov_b32_e32 v26, s24 -; VI-NEXT: v_mov_b32_e32 v29, s25 -; VI-NEXT: v_mov_b32_e32 v30, s15 -; VI-NEXT: v_mov_b32_e32 v31, s14 -; VI-NEXT: v_mov_b32_e32 v27, s10 -; VI-NEXT: v_mov_b32_e32 v19, s8 -; VI-NEXT: v_mov_b32_e32 v11, s6 -; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: v_mov_b32_e32 v28, s23 +; VI-NEXT: v_mov_b32_e32 v16, s20 +; VI-NEXT: v_mov_b32_e32 v20, s21 +; VI-NEXT: v_mov_b32_e32 v8, s18 +; VI-NEXT: v_mov_b32_e32 v12, s19 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v4, s17 +; VI-NEXT: v_mov_b32_e32 v36, s10 +; VI-NEXT: v_mov_b32_e32 v37, s8 +; VI-NEXT: v_mov_b32_e32 v34, s6 +; VI-NEXT: v_mov_b32_e32 v32, s4 ; VI-NEXT: .LBB109_5: ; %end -; VI-NEXT: v_mov_b32_e32 v4, v1 -; VI-NEXT: v_mov_b32_e32 v12, v9 -; VI-NEXT: v_mov_b32_e32 v20, v17 -; VI-NEXT: v_mov_b32_e32 v28, v25 -; VI-NEXT: v_mov_b32_e32 v1, v35 -; VI-NEXT: v_mov_b32_e32 v9, v34 -; VI-NEXT: v_mov_b32_e32 v17, v33 -; VI-NEXT: v_mov_b32_e32 v25, v32 +; VI-NEXT: v_mov_b32_e32 v3, v32 +; VI-NEXT: v_mov_b32_e32 v11, v34 +; VI-NEXT: v_mov_b32_e32 v19, v37 +; VI-NEXT: v_mov_b32_e32 v27, v36 ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: bitcast_v16bf16_to_v32i8_scalar: diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.32bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.32bit.ll index 73b57a52201af..8055ea8be5261 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.32bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.32bit.ll @@ -1392,20 +1392,20 @@ define inreg i32 @bitcast_v2bf16_to_i32_scalar(<2 x bfloat> inreg %a, i32 inreg ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s18, 0 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v4, 1.0, s17 ; SI-NEXT: v_mul_f32_e64 v2, 1.0, s16 ; SI-NEXT: s_cbranch_scc0 .LBB15_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v1 -; SI-NEXT: v_alignbit_b32 v0, v0, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_lshr_b64 v[0:1], v[2:3], 16 ; SI-NEXT: 
s_cbranch_execnz .LBB15_3 ; SI-NEXT: .LBB15_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 ; SI-NEXT: .LBB15_3: ; %end ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB15_4: @@ -1421,24 +1421,24 @@ define inreg i32 @bitcast_v2bf16_to_i32_scalar(<2 x bfloat> inreg %a, i32 inreg ; VI-NEXT: s_cbranch_execnz .LBB15_4 ; VI-NEXT: .LBB15_2: ; %cmp.true ; VI-NEXT: s_lshl_b32 s4, s16, 16 -; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: v_add_f32_e32 v0, s4, v0 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_mov_b32_e32 v1, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v0, s4, v1 ; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_add_f32_e32 v1, s4, v1 ; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB15_3: ; VI-NEXT: s_branch .LBB15_2 @@ -3671,20 +3671,20 @@ define inreg float @bitcast_v2bf16_to_f32_scalar(<2 x bfloat> inreg %a, i32 inre ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s18, 0 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v4, 1.0, s17 ; SI-NEXT: v_mul_f32_e64 v2, 1.0, s16 ; SI-NEXT: s_cbranch_scc0 .LBB35_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v1 -; SI-NEXT: v_alignbit_b32 v0, v0, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_lshr_b64 v[0:1], v[2:3], 16 ; SI-NEXT: s_cbranch_execnz .LBB35_3 ; SI-NEXT: .LBB35_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 ; SI-NEXT: .LBB35_3: ; %end ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB35_4: @@ -3700,24 +3700,24 @@ define inreg float @bitcast_v2bf16_to_f32_scalar(<2 x bfloat> inreg %a, i32 inre ; VI-NEXT: s_cbranch_execnz .LBB35_4 ; VI-NEXT: .LBB35_2: ; %cmp.true ; VI-NEXT: s_lshl_b32 s4, s16, 16 -; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: v_add_f32_e32 v0, s4, v0 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_mov_b32_e32 v1, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v0, s4, v1 ; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_add_f32_e32 v1, s4, v1 ; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB35_3: ; VI-NEXT: s_branch .LBB35_2 @@ -5581,24 +5581,25 @@ define inreg <2 x i16> @bitcast_v2bf16_to_v2i16_scalar(<2 x bfloat> inreg %a, i3 ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s18, 0 -; SI-NEXT: v_mul_f32_e64 v3, 1.0, s16 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v0, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v3, 1.0, s17 ; SI-NEXT: s_cbranch_scc0 .LBB51_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v3 ; SI-NEXT: s_cbranch_execnz .LBB51_3 ; SI-NEXT: .LBB51_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_lshr_b64 v[2:3], v[0:1], 16 ; SI-NEXT: .LBB51_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, v2 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB51_4: -; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: s_branch .LBB51_2 ; @@ -5611,24 +5612,24 @@ define inreg <2 x i16> @bitcast_v2bf16_to_v2i16_scalar(<2 x bfloat> inreg %a, i3 ; VI-NEXT: s_cbranch_execnz .LBB51_4 ; VI-NEXT: .LBB51_2: ; %cmp.true ; VI-NEXT: s_lshl_b32 s4, s16, 16 -; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: v_add_f32_e32 v0, s4, v0 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_mov_b32_e32 v1, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v0, s4, v1 ; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_add_f32_e32 v1, s4, v1 ; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, 
v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB51_3: ; VI-NEXT: s_branch .LBB51_2 @@ -7278,24 +7279,24 @@ define inreg <2 x half> @bitcast_v2bf16_to_v2f16_scalar(<2 x bfloat> inreg %a, i ; VI-NEXT: s_cbranch_execnz .LBB63_4 ; VI-NEXT: .LBB63_2: ; %cmp.true ; VI-NEXT: s_lshl_b32 s4, s16, 16 -; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: v_add_f32_e32 v0, s4, v0 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_mov_b32_e32 v1, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v0, s4, v1 ; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_add_f32_e32 v1, s4, v1 ; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB63_3: ; VI-NEXT: s_branch .LBB63_2 @@ -8720,20 +8721,20 @@ define inreg <1 x i32> @bitcast_v2bf16_to_v1i32_scalar(<2 x bfloat> inreg %a, i3 ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s18, 0 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v4, 1.0, s17 ; SI-NEXT: v_mul_f32_e64 v2, 1.0, s16 ; SI-NEXT: s_cbranch_scc0 .LBB73_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v1 -; SI-NEXT: v_alignbit_b32 v0, v0, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_lshr_b64 v[0:1], v[2:3], 16 ; SI-NEXT: s_cbranch_execnz .LBB73_3 ; SI-NEXT: .LBB73_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 ; SI-NEXT: .LBB73_3: ; %end ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB73_4: @@ -8749,24 +8750,24 @@ define inreg <1 x i32> @bitcast_v2bf16_to_v1i32_scalar(<2 x bfloat> inreg %a, i3 ; VI-NEXT: s_cbranch_execnz .LBB73_4 ; VI-NEXT: .LBB73_2: ; %cmp.true ; VI-NEXT: s_lshl_b32 s4, s16, 16 -; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: v_add_f32_e32 v0, s4, v0 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_mov_b32_e32 v1, 0x40c00000 +; VI-NEXT: 
v_add_f32_e32 v0, s4, v1 ; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_add_f32_e32 v1, s4, v1 ; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB73_3: ; VI-NEXT: s_branch .LBB73_2 @@ -9336,30 +9337,31 @@ define inreg <4 x i8> @bitcast_v2bf16_to_v4i8_scalar(<2 x bfloat> inreg %a, i32 ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s18, 0 -; SI-NEXT: v_mul_f32_e64 v4, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v5, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v0, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s16 ; SI-NEXT: s_cbranch_scc0 .LBB77_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v4 -; SI-NEXT: v_alignbit_b32 v0, v2, v5, 16 -; SI-NEXT: v_lshrrev_b32_e32 v3, 24, v4 -; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; SI-NEXT: v_lshr_b64 v[4:5], v[1:2], 16 +; SI-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v4 ; SI-NEXT: s_cbranch_execnz .LBB77_3 ; SI-NEXT: .LBB77_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v5 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v3 -; SI-NEXT: v_alignbit_b32 v0, v2, v0, 16 -; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v0 -; SI-NEXT: v_lshrrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; SI-NEXT: v_lshr_b64 v[4:5], v[1:2], 16 +; SI-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v4 ; SI-NEXT: .LBB77_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, v4 +; SI-NEXT: v_mov_b32_e32 v1, v5 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB77_4: -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: s_branch .LBB77_2 ; @@ -9369,9 +9371,9 @@ define inreg <4 x i8> @bitcast_v2bf16_to_v4i8_scalar(<2 x bfloat> inreg %a, i32 ; VI-NEXT: s_cmp_lg_u32 s17, 0 ; VI-NEXT: s_cbranch_scc0 .LBB77_3 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: s_lshr_b32 s7, s16, 24 -; VI-NEXT: s_lshr_b32 s6, s16, 16 -; VI-NEXT: s_lshr_b32 s8, s16, 8 +; VI-NEXT: s_lshr_b32 s6, s16, 24 +; VI-NEXT: s_lshr_b32 s8, s16, 16 +; VI-NEXT: s_lshr_b32 s7, s16, 8 ; VI-NEXT: s_cbranch_execnz .LBB77_4 ; VI-NEXT: .LBB77_2: ; %cmp.true ; VI-NEXT: s_lshl_b32 s4, s16, 16 @@ -9392,21 +9394,21 @@ define inreg <4 x i8> @bitcast_v2bf16_to_v4i8_scalar(<2 x bfloat> inreg %a, i32 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; VI-NEXT: v_alignbit_b32 v4, v2, v1, 16 +; VI-NEXT: v_lshrrev_b64 v[4:5], 16, v[1:2] ; VI-NEXT: 
v_lshrrev_b32_e32 v0, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v3, 24, v4 ; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v4 ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB77_3: +; VI-NEXT: ; implicit-def: $sgpr7 ; VI-NEXT: ; implicit-def: $sgpr8 ; VI-NEXT: ; implicit-def: $sgpr6 -; VI-NEXT: ; implicit-def: $sgpr7 ; VI-NEXT: s_branch .LBB77_2 ; VI-NEXT: .LBB77_4: -; VI-NEXT: v_mov_b32_e32 v1, s8 -; VI-NEXT: v_mov_b32_e32 v3, s7 -; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: v_mov_b32_e32 v2, s8 ; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v3, s6 +; VI-NEXT: v_mov_b32_e32 v1, s7 ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: bitcast_v2bf16_to_v4i8_scalar: diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.48bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.48bit.ll index d5d2d4aafaa19..08038b90687c0 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.48bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.48bit.ll @@ -290,34 +290,34 @@ define inreg <3 x half> @bitcast_v3bf16_to_v3f16_scalar(<3 x bfloat> inreg %a, i ; VI-NEXT: s_cbranch_execnz .LBB1_4 ; VI-NEXT: .LBB1_2: ; %cmp.true ; VI-NEXT: s_lshl_b32 s4, s17, 16 -; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v0, s4, v1 +; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; VI-NEXT: s_lshl_b32 s4, s16, 16 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_add_f32_e32 v0, s4, v0 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_add_f32_e32 v0, s4, v1 +; VI-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc ; VI-NEXT: v_bfe_u32 v3, v0, 16, 1 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v0 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_add_f32_e32 v1, s4, v1 ; VI-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v0, v2, 16 -; VI-NEXT: v_mov_b32_e32 v2, 0x7fc00000 -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, 0x7fc00000 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB1_3: ; VI-NEXT: s_branch .LBB1_2 @@ -964,16 +964,16 @@ define inreg <3 x i16> @bitcast_v3bf16_to_v3i16_scalar(<3 x bfloat> inreg %a, i3 ; SI-NEXT: s_cbranch_execnz .LBB5_3 ; SI-NEXT: .LBB5_2: ; %cmp.true ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, 
v5 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v4 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v5 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshr_b64 v[3:4], v[1:2], 16 -; SI-NEXT: v_alignbit_b32 v0, v5, v0, 16 ; SI-NEXT: .LBB5_3: ; %end ; SI-NEXT: v_mov_b32_e32 v1, v3 ; SI-NEXT: s_setpc_b64 s[30:31] @@ -992,34 +992,34 @@ define inreg <3 x i16> @bitcast_v3bf16_to_v3i16_scalar(<3 x bfloat> inreg %a, i3 ; VI-NEXT: s_cbranch_execnz .LBB5_4 ; VI-NEXT: .LBB5_2: ; %cmp.true ; VI-NEXT: s_lshl_b32 s4, s17, 16 -; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v0, s4, v1 +; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; VI-NEXT: s_lshl_b32 s4, s16, 16 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_add_f32_e32 v0, s4, v0 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_add_f32_e32 v0, s4, v1 +; VI-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc ; VI-NEXT: v_bfe_u32 v3, v0, 16, 1 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v0 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_add_f32_e32 v1, s4, v1 ; VI-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v0, v2, 16 -; VI-NEXT: v_mov_b32_e32 v2, 0x7fc00000 -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, 0x7fc00000 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB5_3: ; VI-NEXT: s_branch .LBB5_2 diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll index c8d176237815f..f482843af010a 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll @@ -6495,172 +6495,211 @@ define inreg <16 x i32> @bitcast_v32bf16_to_v16i32_scalar(<32 x bfloat> inreg %a ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mul_f32_e64 v34, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v35, 1.0, s16 -; SI-NEXT: v_mul_f32_e32 v32, 1.0, v1 -; SI-NEXT: v_mul_f32_e32 v33, 1.0, v0 -; SI-NEXT: v_mul_f32_e32 v30, 1.0, v3 -; SI-NEXT: v_mul_f32_e32 v31, 1.0, v2 -; SI-NEXT: v_mul_f32_e32 v28, 1.0, v5 -; SI-NEXT: v_mul_f32_e32 v29, 1.0, v4 -; SI-NEXT: v_mul_f32_e32 v26, 1.0, v7 -; SI-NEXT: v_mul_f32_e32 v27, 1.0, v6 -; SI-NEXT: v_mul_f32_e32 v24, 1.0, v9 -; SI-NEXT: v_mul_f32_e32 v25, 1.0, v8 -; SI-NEXT: v_mul_f32_e32 v22, 1.0, v11 -; SI-NEXT: v_mul_f32_e32 v23, 1.0, v10 -; SI-NEXT: v_mul_f32_e32 v20, 1.0, v13 -; SI-NEXT: v_mul_f32_e32 v21, 1.0, v12 -; SI-NEXT: v_mul_f32_e32 v18, 1.0, v15 -; SI-NEXT: v_mul_f32_e32 v19, 1.0, v14 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_mul_f32_e64 v60, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v59, 1.0, s19 +; SI-NEXT: v_mul_f32_e32 v56, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v47, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v46, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v45, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v44, 1.0, v9 +; SI-NEXT: v_mul_f32_e32 v43, 1.0, v11 +; SI-NEXT: v_mul_f32_e32 v42, 1.0, v13 +; SI-NEXT: v_mul_f32_e32 v41, 1.0, v15 ; SI-NEXT: v_mul_f32_e32 v17, 1.0, v17 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v63, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v62, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v61, 1.0, s25 +; SI-NEXT: v_mul_f32_e64 v58, 1.0, s27 +; SI-NEXT: v_mul_f32_e64 v57, 1.0, s29 +; SI-NEXT: v_mul_f32_e32 v32, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v30, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v28, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v26, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v24, 1.0, v8 +; SI-NEXT: v_mul_f32_e32 v22, 1.0, v10 +; SI-NEXT: v_mul_f32_e32 v20, 1.0, v12 +; SI-NEXT: v_mul_f32_e32 v18, 1.0, v14 ; SI-NEXT: v_mul_f32_e32 v16, 1.0, v16 -; SI-NEXT: v_mul_f32_e64 v54, 1.0, s19 -; SI-NEXT: v_mul_f32_e64 v55, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v52, 1.0, s21 -; SI-NEXT: v_mul_f32_e64 v53, 1.0, s20 -; SI-NEXT: v_mul_f32_e64 v50, 1.0, s23 -; SI-NEXT: v_mul_f32_e64 v51, 1.0, s22 -; SI-NEXT: v_mul_f32_e64 v48, 1.0, s25 -; SI-NEXT: v_mul_f32_e64 v49, 1.0, s24 -; SI-NEXT: v_mul_f32_e64 v38, 
1.0, s27 -; SI-NEXT: v_mul_f32_e64 v39, 1.0, s26 -; SI-NEXT: v_mul_f32_e64 v36, 1.0, s29 -; SI-NEXT: v_mul_f32_e64 v37, 1.0, s28 +; SI-NEXT: v_mul_f32_e64 v54, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v52, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v50, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v48, 1.0, s22 +; SI-NEXT: v_mul_f32_e64 v38, 1.0, s24 +; SI-NEXT: v_mul_f32_e64 v36, 1.0, s26 +; SI-NEXT: v_mul_f32_e64 v34, 1.0, s28 ; SI-NEXT: s_cbranch_scc0 .LBB23_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v34 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v54 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v52 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v50 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v48 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v38 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v36 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v32 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v30 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v28 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v17 -; SI-NEXT: v_alignbit_b32 v0, v0, v35, 16 -; SI-NEXT: v_alignbit_b32 v1, v1, v55, 16 -; SI-NEXT: v_alignbit_b32 v2, v2, v53, 16 -; SI-NEXT: v_alignbit_b32 v3, v3, v51, 16 -; SI-NEXT: v_alignbit_b32 v4, v4, v49, 16 -; SI-NEXT: v_alignbit_b32 v5, v5, v39, 16 -; SI-NEXT: v_alignbit_b32 v6, v6, v37, 16 -; SI-NEXT: v_alignbit_b32 v7, v7, v33, 16 -; SI-NEXT: v_alignbit_b32 v8, v8, v31, 16 -; SI-NEXT: v_alignbit_b32 v9, v9, v29, 16 -; SI-NEXT: v_alignbit_b32 v10, v10, v27, 16 -; SI-NEXT: v_alignbit_b32 v11, v11, v25, 16 -; SI-NEXT: v_alignbit_b32 v12, v12, v23, 16 -; SI-NEXT: v_alignbit_b32 v13, v13, v21, 16 -; SI-NEXT: v_alignbit_b32 v14, v14, v19, 16 -; SI-NEXT: v_alignbit_b32 v15, v15, v16, 16 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v60 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v59 +; SI-NEXT: v_lshr_b64 v[0:1], v[54:55], 16 +; SI-NEXT: v_lshr_b64 v[1:2], v[52:53], 16 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v63 +; SI-NEXT: v_lshr_b64 v[2:3], v[50:51], 16 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v62 +; SI-NEXT: v_lshr_b64 v[3:4], v[48:49], 16 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v61 +; SI-NEXT: v_lshr_b64 v[4:5], v[38:39], 16 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v58 +; SI-NEXT: v_lshr_b64 v[5:6], v[36:37], 16 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v57 +; SI-NEXT: v_lshr_b64 v[6:7], v[34:35], 16 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v56 +; SI-NEXT: v_lshr_b64 v[7:8], v[32:33], 16 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v47 +; SI-NEXT: v_lshr_b64 v[8:9], v[30:31], 16 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v46 +; SI-NEXT: v_lshr_b64 v[9:10], v[28:29], 16 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v45 +; SI-NEXT: v_lshr_b64 v[10:11], v[26:27], 16 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v44 +; SI-NEXT: v_lshr_b64 v[11:12], v[24:25], 16 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v43 +; SI-NEXT: v_lshr_b64 v[12:13], v[22:23], 16 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v42 +; SI-NEXT: v_lshr_b64 v[13:14], v[20:21], 16 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v41 +; SI-NEXT: v_lshr_b64 v[14:15], v[18:19], 16 +; SI-NEXT: v_mov_b32_e32 v15, v17 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v15 +; SI-NEXT: v_lshr_b64 v[39:40], v[16:17], 16 +; SI-NEXT: v_mov_b32_e32 v17, v15 +; SI-NEXT: v_mov_b32_e32 v15, v39 ; SI-NEXT: s_cbranch_execnz .LBB23_3 ; SI-NEXT: .LBB23_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v34 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v35 +; SI-NEXT: v_and_b32_e32 
v1, 0xffff0000, v60 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v54 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v59 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v54 -; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v55 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v52 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v52 -; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v53 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 +; SI-NEXT: v_lshr_b64 v[1:2], v[2:3], 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v63 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v50 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v50 -; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v51 +; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 16 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v62 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v48 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v48 -; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v49 +; SI-NEXT: v_lshr_b64 v[3:4], v[3:4], 16 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v61 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v38 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v38 -; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v39 +; SI-NEXT: v_lshr_b64 v[4:5], v[4:5], 16 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v58 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v36 ; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 ; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v36 -; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v37 +; SI-NEXT: v_lshr_b64 v[5:6], v[5:6], 16 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v57 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v34 ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 ; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 ; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v32 -; SI-NEXT: v_alignbit_b32 v6, v7, v6, 16 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v33 +; SI-NEXT: v_lshr_b64 v[6:7], v[6:7], 16 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v56 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v32 ; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 ; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v30 -; SI-NEXT: v_alignbit_b32 v7, v8, v7, 16 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v31 +; SI-NEXT: v_lshr_b64 v[7:8], v[7:8], 16 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v47 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v30 ; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 ; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 ; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v28 -; SI-NEXT: v_alignbit_b32 v8, v9, v8, 16 -; SI-NEXT: 
v_and_b32_e32 v9, 0xffff0000, v29 +; SI-NEXT: v_lshr_b64 v[8:9], v[8:9], 16 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v46 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v28 ; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 ; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 ; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v26 -; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16 -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v27 +; SI-NEXT: v_lshr_b64 v[9:10], v[9:10], 16 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v45 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v26 ; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 ; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 ; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v24 -; SI-NEXT: v_alignbit_b32 v10, v11, v10, 16 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v25 +; SI-NEXT: v_lshr_b64 v[10:11], v[10:11], 16 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v44 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v24 ; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 ; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 ; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v22 -; SI-NEXT: v_alignbit_b32 v11, v12, v11, 16 -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v23 +; SI-NEXT: v_lshr_b64 v[11:12], v[11:12], 16 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v43 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v22 ; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 ; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 ; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v20 -; SI-NEXT: v_alignbit_b32 v12, v13, v12, 16 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v21 +; SI-NEXT: v_lshr_b64 v[12:13], v[12:13], 16 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v42 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v20 ; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 ; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 ; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v18 -; SI-NEXT: v_alignbit_b32 v13, v14, v13, 16 -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v19 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; SI-NEXT: v_lshr_b64 v[13:14], v[13:14], 16 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v41 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v18 ; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 ; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 ; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_alignbit_b32 v14, v15, v14, 16 -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v16 -; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v17 -; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_alignbit_b32 v15, v16, v15, 16 +; SI-NEXT: v_lshr_b64 v[16:17], v[16:17], 16 +; SI-NEXT: v_lshr_b64 v[14:15], v[14:15], 16 +; SI-NEXT: v_mov_b32_e32 v15, v16 ; SI-NEXT: .LBB23_3: ; %end +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte 
Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB23_4: ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 @@ -6670,11 +6709,11 @@ define inreg <16 x i32> @bitcast_v32bf16_to_v16i32_scalar(<32 x bfloat> inreg %a ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 ; 4-byte Folded Spill ; VI-NEXT: s_mov_b64 exec, s[4:5] -; VI-NEXT: v_writelane_b32 v19, s30, 0 +; VI-NEXT: v_writelane_b32 v20, s30, 0 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; VI-NEXT: v_writelane_b32 v19, s31, 1 +; VI-NEXT: v_writelane_b32 v20, s31, 1 ; VI-NEXT: v_readfirstlane_b32 s30, v0 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_readfirstlane_b32 s31, v1 @@ -6682,295 +6721,303 @@ define inreg <16 x i32> @bitcast_v32bf16_to_v16i32_scalar(<32 x bfloat> inreg %a ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_cbranch_execnz .LBB23_4 ; VI-NEXT: .LBB23_2: ; %cmp.true -; VI-NEXT: s_lshl_b32 s4, s31, 16 -; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s31, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_mov_b32_e32 v16, 0x40c00000 ; VI-NEXT: s_lshl_b32 s4, s30, 16 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_add_f32_e32 v3, s4, v0 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_add_f32_e32 v0, s4, v16 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; VI-NEXT: s_and_b32 s4, s30, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: v_add_f32_e32 v4, s4, v0 -; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; 
VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; VI-NEXT: v_alignbit_b32 v15, v2, v1, 16 -; VI-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-NEXT: s_lshl_b32 s4, s29, 16 -; VI-NEXT: v_alignbit_b32 v14, v1, v3, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; VI-NEXT: v_add_f32_e32 v1, s4, v16 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s29, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; VI-NEXT: s_lshl_b32 s4, s28, 16 -; VI-NEXT: v_alignbit_b32 v13, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s28, 0xffff0000 ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_add_f32_e32 v2, s4, v16 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: s_and_b32 s6, s28, 0xffff0000 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s27, 16 -; VI-NEXT: v_alignbit_b32 v12, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s27, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; VI-NEXT: v_add_f32_e32 v2, s6, v16 +; VI-NEXT: v_bfe_u32 v5, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v2 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc +; VI-NEXT: v_lshrrev_b64 v[14:15], 16, v[0:1] +; VI-NEXT: v_cndmask_b32_e64 v0, v3, v4, s[4:5] +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; VI-NEXT: s_and_b32 s5, s26, 0xffff0000 +; VI-NEXT: v_lshrrev_b64 v[12:13], 16, v[0:1] +; VI-NEXT: v_add_f32_e32 v0, s5, v16 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 ; VI-NEXT: s_lshl_b32 s4, s26, 16 -; VI-NEXT: v_alignbit_b32 v11, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: 
v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s26, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s25, 16 -; VI-NEXT: v_alignbit_b32 v10, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_add_f32_e32 v0, s4, v16 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s25, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: s_and_b32 s5, s24, 0xffff0000 +; VI-NEXT: v_lshrrev_b64 v[10:11], 16, v[0:1] +; VI-NEXT: v_add_f32_e32 v0, s5, v16 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 ; VI-NEXT: s_lshl_b32 s4, s24, 16 -; VI-NEXT: v_alignbit_b32 v9, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s24, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s23, 16 -; VI-NEXT: v_alignbit_b32 v8, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_add_f32_e32 v0, s4, v16 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 
v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: s_and_b32 s5, s22, 0xffff0000 +; VI-NEXT: v_lshrrev_b64 v[8:9], 16, v[0:1] +; VI-NEXT: v_add_f32_e32 v0, s5, v16 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 ; VI-NEXT: s_lshl_b32 s4, s22, 16 -; VI-NEXT: v_alignbit_b32 v7, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s22, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s21, 16 -; VI-NEXT: v_alignbit_b32 v6, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_add_f32_e32 v0, s4, v16 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: s_and_b32 s5, s20, 0xffff0000 +; VI-NEXT: v_lshrrev_b64 v[6:7], 16, v[0:1] +; VI-NEXT: v_add_f32_e32 v0, s5, v16 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 ; VI-NEXT: s_lshl_b32 s4, s20, 16 -; VI-NEXT: v_alignbit_b32 v5, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s19, 16 -; VI-NEXT: v_alignbit_b32 v4, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: 
v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_add_f32_e32 v0, s4, v16 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v16, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v16, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: s_and_b32 s5, s18, 0xffff0000 +; VI-NEXT: v_lshrrev_b64 v[4:5], 16, v[0:1] +; VI-NEXT: v_add_f32_e32 v0, s5, v16 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 ; VI-NEXT: s_lshl_b32 s4, s18, 16 -; VI-NEXT: v_alignbit_b32 v3, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_add_f32_e32 v0, s4, v16 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v16, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v16, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v16, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v2 -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16 -; VI-NEXT: v_or_b32_e32 v17, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v16, v17, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s17, 16 -; VI-NEXT: v_alignbit_b32 v2, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v16, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v1 -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16 -; VI-NEXT: v_or_b32_e32 v17, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v16, v17, vcc -; VI-NEXT: v_add_f32_e32 v16, s4, v0 -; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 -; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 -; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 -; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 -; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc -; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; VI-NEXT: s_lshl_b32 s4, s16, 16 -; VI-NEXT: v_alignbit_b32 v1, v16, v1, 16 -; VI-NEXT: v_add_f32_e32 v16, s4, v0 -; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 -; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 +; VI-NEXT: v_lshrrev_b64 v[2:3], 16, v[0:1] +; VI-NEXT: v_add_f32_e32 v0, s4, v16 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: s_and_b32 s6, s16, 0xffff0000 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_or_b32_e32 v3, 
0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; VI-NEXT: v_add_f32_e32 v0, s6, v16 +; VI-NEXT: v_bfe_u32 v5, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc +; VI-NEXT: s_lshl_b32 s6, s17, 16 +; VI-NEXT: v_cndmask_b32_e64 v0, v1, v3, s[4:5] +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v5 +; VI-NEXT: v_add_f32_e32 v3, s6, v16 +; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] +; VI-NEXT: v_bfe_u32 v1, v3, 16, 1 +; VI-NEXT: s_and_b32 s6, s17, 0xffff0000 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; VI-NEXT: v_add_f32_e32 v3, s6, v16 +; VI-NEXT: v_bfe_u32 v7, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v3 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc +; VI-NEXT: s_lshl_b32 s6, s19, 16 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v3 +; VI-NEXT: v_add_f32_e32 v3, s6, v16 +; VI-NEXT: v_cndmask_b32_e64 v17, v1, v5, s[4:5] +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: s_and_b32 s6, s19, 0xffff0000 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; VI-NEXT: v_add_f32_e32 v3, s6, v16 +; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18] +; VI-NEXT: v_bfe_u32 v9, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v3 +; VI-NEXT: s_lshl_b32 s6, s21, 16 +; VI-NEXT: v_mov_b32_e32 v1, v17 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; VI-NEXT: v_cndmask_b32_e64 v17, v5, v7, s[4:5] +; VI-NEXT: v_add_f32_e32 v5, s6, v16 +; VI-NEXT: v_or_b32_e32 v11, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_bfe_u32 v7, v5, 16, 1 +; VI-NEXT: s_and_b32 s6, s21, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v3, v9, v11, vcc +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v5 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 +; VI-NEXT: v_add_f32_e32 v5, s6, v16 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v3 +; VI-NEXT: v_bfe_u32 v11, v5, 16, 1 +; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18] +; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v5 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 +; VI-NEXT: s_lshl_b32 s6, s23, 16 +; VI-NEXT: v_mov_b32_e32 v3, v17 +; VI-NEXT: v_or_b32_e32 v13, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: v_cndmask_b32_e64 v17, v7, v9, s[4:5] +; VI-NEXT: v_add_f32_e32 v7, s6, v16 +; VI-NEXT: v_cndmask_b32_e32 v5, v11, v13, vcc +; VI-NEXT: v_bfe_u32 v9, v7, 16, 1 +; VI-NEXT: s_and_b32 s6, s23, 0xffff0000 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v5 +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v7 +; VI-NEXT: v_or_b32_e32 v11, 0x400000, v7 +; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v7, v7 +; VI-NEXT: v_add_f32_e32 v7, s6, v16 +; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18] +; VI-NEXT: v_bfe_u32 v13, v7, 16, 1 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v7 +; VI-NEXT: s_lshl_b32 s6, s25, 16 +; VI-NEXT: v_mov_b32_e32 v5, v17 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 +; VI-NEXT: v_cndmask_b32_e64 v17, v9, v11, s[4:5] +; VI-NEXT: v_add_f32_e32 v9, s6, v16 +; VI-NEXT: v_or_b32_e32 v15, 
0x400000, v7 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; VI-NEXT: v_bfe_u32 v11, v9, 16, 1 +; VI-NEXT: s_and_b32 s6, s25, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v7, v13, v15, vcc +; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v9 +; VI-NEXT: v_or_b32_e32 v13, 0x400000, v9 +; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v9, v9 +; VI-NEXT: v_add_f32_e32 v9, s6, v16 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v7 +; VI-NEXT: v_bfe_u32 v15, v9, 16, 1 +; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18] +; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v9 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 +; VI-NEXT: v_mov_b32_e32 v7, v17 +; VI-NEXT: v_or_b32_e32 v17, 0x400000, v9 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; VI-NEXT: v_cndmask_b32_e32 v9, v15, v17, vcc +; VI-NEXT: s_lshl_b32 s6, s27, 16 +; VI-NEXT: v_cndmask_b32_e64 v17, v11, v13, s[4:5] +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v9 +; VI-NEXT: v_add_f32_e32 v11, s6, v16 +; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18] +; VI-NEXT: v_bfe_u32 v13, v11, 16, 1 +; VI-NEXT: s_and_b32 s6, s27, 0xffff0000 +; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v11 +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v11 +; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v11, v11 +; VI-NEXT: v_add_f32_e32 v11, s6, v16 +; VI-NEXT: v_mov_b32_e32 v9, v17 +; VI-NEXT: v_bfe_u32 v17, v11, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v11 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 ; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 -; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 -; VI-NEXT: v_add_f32_e32 v0, s4, v0 -; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc -; VI-NEXT: v_bfe_u32 v17, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v0 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v11 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; VI-NEXT: s_and_b32 s7, s31, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v11, v17, v18, vcc +; VI-NEXT: v_cndmask_b32_e64 v17, v13, v15, s[4:5] +; VI-NEXT: v_add_f32_e32 v13, s7, v16 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v11 +; VI-NEXT: v_bfe_u32 v15, v13, 16, 1 +; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18] +; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v13 +; VI-NEXT: s_lshl_b32 s6, s31, 16 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 +; VI-NEXT: v_mov_b32_e32 v11, v17 +; VI-NEXT: v_or_b32_e32 v17, 0x400000, v13 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; VI-NEXT: v_add_f32_e32 v13, s6, v16 +; VI-NEXT: v_cndmask_b32_e32 v15, v15, v17, vcc +; VI-NEXT: v_bfe_u32 v17, v13, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v13 ; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 -; VI-NEXT: v_or_b32_e32 v18, 0x400000, v0 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v17, v18, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v0, v16, 16 +; VI-NEXT: s_and_b32 s4, s29, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v13 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; VI-NEXT: v_add_f32_e32 v13, s4, v16 +; VI-NEXT: v_cndmask_b32_e32 v17, v17, v18, vcc +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v15 +; VI-NEXT: v_bfe_u32 v15, v13, 16, 1 +; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v13 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 +; VI-NEXT: v_or_b32_e32 v19, 0x400000, v13 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; VI-NEXT: s_lshl_b32 s4, s29, 16 +; VI-NEXT: v_cndmask_b32_e32 v13, v15, v19, vcc +; VI-NEXT: v_add_f32_e32 v15, s4, v16 +; VI-NEXT: v_bfe_u32 v16, v15, 16, 1 +; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v15 +; VI-NEXT: v_add_u32_e32 v16, 
vcc, 0x7fff, v16 +; VI-NEXT: v_or_b32_e32 v19, 0x400000, v15 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; VI-NEXT: v_cndmask_b32_e32 v15, v16, v19, vcc +; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v13 +; VI-NEXT: v_lshrrev_b64 v[15:16], 16, v[15:16] +; VI-NEXT: v_lshrrev_b64 v[16:17], 16, v[17:18] +; VI-NEXT: v_mov_b32_e32 v13, v15 +; VI-NEXT: v_mov_b32_e32 v15, v16 ; VI-NEXT: s_branch .LBB23_5 ; VI-NEXT: .LBB23_3: ; VI-NEXT: s_branch .LBB23_2 @@ -6992,10 +7039,10 @@ define inreg <16 x i32> @bitcast_v32bf16_to_v16i32_scalar(<32 x bfloat> inreg %a ; VI-NEXT: v_mov_b32_e32 v14, s30 ; VI-NEXT: v_mov_b32_e32 v15, s31 ; VI-NEXT: .LBB23_5: ; %end -; VI-NEXT: v_readlane_b32 s31, v19, 1 -; VI-NEXT: v_readlane_b32 s30, v19, 0 +; VI-NEXT: v_readlane_b32 s31, v20, 1 +; VI-NEXT: v_readlane_b32 s30, v20, 0 ; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 ; 4-byte Folded Reload ; VI-NEXT: s_mov_b64 exec, s[4:5] ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] @@ -21380,172 +21427,211 @@ define inreg <16 x float> @bitcast_v32bf16_to_v16f32_scalar(<32 x bfloat> inreg ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mul_f32_e64 v34, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v35, 1.0, s16 -; SI-NEXT: v_mul_f32_e32 v32, 1.0, v1 -; SI-NEXT: v_mul_f32_e32 v33, 1.0, v0 -; SI-NEXT: v_mul_f32_e32 v30, 1.0, v3 -; SI-NEXT: v_mul_f32_e32 v31, 1.0, v2 -; SI-NEXT: v_mul_f32_e32 v28, 1.0, v5 -; SI-NEXT: v_mul_f32_e32 v29, 1.0, v4 -; SI-NEXT: v_mul_f32_e32 v26, 1.0, v7 -; SI-NEXT: v_mul_f32_e32 v27, 1.0, v6 -; SI-NEXT: v_mul_f32_e32 v24, 1.0, v9 -; SI-NEXT: v_mul_f32_e32 v25, 1.0, v8 -; SI-NEXT: v_mul_f32_e32 v22, 1.0, v11 -; SI-NEXT: v_mul_f32_e32 v23, 1.0, v10 -; SI-NEXT: v_mul_f32_e32 v20, 1.0, v13 -; SI-NEXT: v_mul_f32_e32 v21, 1.0, v12 -; SI-NEXT: v_mul_f32_e32 v18, 1.0, v15 -; SI-NEXT: v_mul_f32_e32 v19, 1.0, v14 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_mul_f32_e64 v60, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v59, 1.0, s19 +; SI-NEXT: v_mul_f32_e32 v56, 1.0, v1 +; 
SI-NEXT: v_mul_f32_e32 v47, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v46, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v45, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v44, 1.0, v9 +; SI-NEXT: v_mul_f32_e32 v43, 1.0, v11 +; SI-NEXT: v_mul_f32_e32 v42, 1.0, v13 +; SI-NEXT: v_mul_f32_e32 v41, 1.0, v15 ; SI-NEXT: v_mul_f32_e32 v17, 1.0, v17 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v63, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v62, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v61, 1.0, s25 +; SI-NEXT: v_mul_f32_e64 v58, 1.0, s27 +; SI-NEXT: v_mul_f32_e64 v57, 1.0, s29 +; SI-NEXT: v_mul_f32_e32 v32, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v30, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v28, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v26, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v24, 1.0, v8 +; SI-NEXT: v_mul_f32_e32 v22, 1.0, v10 +; SI-NEXT: v_mul_f32_e32 v20, 1.0, v12 +; SI-NEXT: v_mul_f32_e32 v18, 1.0, v14 ; SI-NEXT: v_mul_f32_e32 v16, 1.0, v16 -; SI-NEXT: v_mul_f32_e64 v54, 1.0, s19 -; SI-NEXT: v_mul_f32_e64 v55, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v52, 1.0, s21 -; SI-NEXT: v_mul_f32_e64 v53, 1.0, s20 -; SI-NEXT: v_mul_f32_e64 v50, 1.0, s23 -; SI-NEXT: v_mul_f32_e64 v51, 1.0, s22 -; SI-NEXT: v_mul_f32_e64 v48, 1.0, s25 -; SI-NEXT: v_mul_f32_e64 v49, 1.0, s24 -; SI-NEXT: v_mul_f32_e64 v38, 1.0, s27 -; SI-NEXT: v_mul_f32_e64 v39, 1.0, s26 -; SI-NEXT: v_mul_f32_e64 v36, 1.0, s29 -; SI-NEXT: v_mul_f32_e64 v37, 1.0, s28 +; SI-NEXT: v_mul_f32_e64 v54, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v52, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v50, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v48, 1.0, s22 +; SI-NEXT: v_mul_f32_e64 v38, 1.0, s24 +; SI-NEXT: v_mul_f32_e64 v36, 1.0, s26 +; SI-NEXT: v_mul_f32_e64 v34, 1.0, s28 ; SI-NEXT: s_cbranch_scc0 .LBB47_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v34 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v54 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v52 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v50 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v48 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v38 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v36 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v32 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v30 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v28 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v17 -; SI-NEXT: v_alignbit_b32 v0, v0, v35, 16 -; SI-NEXT: v_alignbit_b32 v1, v1, v55, 16 -; SI-NEXT: v_alignbit_b32 v2, v2, v53, 16 -; SI-NEXT: v_alignbit_b32 v3, v3, v51, 16 -; SI-NEXT: v_alignbit_b32 v4, v4, v49, 16 -; SI-NEXT: v_alignbit_b32 v5, v5, v39, 16 -; SI-NEXT: v_alignbit_b32 v6, v6, v37, 16 -; SI-NEXT: v_alignbit_b32 v7, v7, v33, 16 -; SI-NEXT: v_alignbit_b32 v8, v8, v31, 16 -; SI-NEXT: v_alignbit_b32 v9, v9, v29, 16 -; SI-NEXT: v_alignbit_b32 v10, v10, v27, 16 -; SI-NEXT: v_alignbit_b32 v11, v11, v25, 16 -; SI-NEXT: v_alignbit_b32 v12, v12, v23, 16 -; SI-NEXT: v_alignbit_b32 v13, v13, v21, 16 -; SI-NEXT: v_alignbit_b32 v14, v14, v19, 16 -; SI-NEXT: v_alignbit_b32 v15, v15, v16, 16 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v60 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v59 +; SI-NEXT: v_lshr_b64 v[0:1], v[54:55], 16 +; SI-NEXT: v_lshr_b64 v[1:2], v[52:53], 16 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v63 +; SI-NEXT: v_lshr_b64 v[2:3], v[50:51], 16 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v62 +; SI-NEXT: v_lshr_b64 v[3:4], v[48:49], 16 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v61 +; SI-NEXT: v_lshr_b64 v[4:5], v[38:39], 16 +; SI-NEXT: 
v_lshrrev_b32_e32 v37, 16, v58 +; SI-NEXT: v_lshr_b64 v[5:6], v[36:37], 16 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v57 +; SI-NEXT: v_lshr_b64 v[6:7], v[34:35], 16 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v56 +; SI-NEXT: v_lshr_b64 v[7:8], v[32:33], 16 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v47 +; SI-NEXT: v_lshr_b64 v[8:9], v[30:31], 16 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v46 +; SI-NEXT: v_lshr_b64 v[9:10], v[28:29], 16 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v45 +; SI-NEXT: v_lshr_b64 v[10:11], v[26:27], 16 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v44 +; SI-NEXT: v_lshr_b64 v[11:12], v[24:25], 16 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v43 +; SI-NEXT: v_lshr_b64 v[12:13], v[22:23], 16 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v42 +; SI-NEXT: v_lshr_b64 v[13:14], v[20:21], 16 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v41 +; SI-NEXT: v_lshr_b64 v[14:15], v[18:19], 16 +; SI-NEXT: v_mov_b32_e32 v15, v17 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v15 +; SI-NEXT: v_lshr_b64 v[39:40], v[16:17], 16 +; SI-NEXT: v_mov_b32_e32 v17, v15 +; SI-NEXT: v_mov_b32_e32 v15, v39 ; SI-NEXT: s_cbranch_execnz .LBB47_3 ; SI-NEXT: .LBB47_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v34 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v35 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v60 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v54 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v59 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v54 -; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v55 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v52 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v52 -; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v53 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 +; SI-NEXT: v_lshr_b64 v[1:2], v[2:3], 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v63 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v50 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v50 -; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v51 +; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 16 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v62 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v48 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v48 -; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v49 +; SI-NEXT: v_lshr_b64 v[3:4], v[3:4], 16 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v61 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v38 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v38 -; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v39 +; SI-NEXT: v_lshr_b64 v[4:5], v[4:5], 16 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v58 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v36 ; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 ; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, 
v36 -; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v37 +; SI-NEXT: v_lshr_b64 v[5:6], v[5:6], 16 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v57 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v34 ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 ; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 ; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v32 -; SI-NEXT: v_alignbit_b32 v6, v7, v6, 16 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v33 +; SI-NEXT: v_lshr_b64 v[6:7], v[6:7], 16 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v56 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v32 ; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 ; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v30 -; SI-NEXT: v_alignbit_b32 v7, v8, v7, 16 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v31 +; SI-NEXT: v_lshr_b64 v[7:8], v[7:8], 16 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v47 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v30 ; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 ; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 ; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v28 -; SI-NEXT: v_alignbit_b32 v8, v9, v8, 16 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v29 +; SI-NEXT: v_lshr_b64 v[8:9], v[8:9], 16 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v46 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v28 ; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 ; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 ; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v26 -; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16 -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v27 +; SI-NEXT: v_lshr_b64 v[9:10], v[9:10], 16 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v45 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v26 ; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 ; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 ; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v24 -; SI-NEXT: v_alignbit_b32 v10, v11, v10, 16 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v25 +; SI-NEXT: v_lshr_b64 v[10:11], v[10:11], 16 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v44 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v24 ; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 ; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 ; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v22 -; SI-NEXT: v_alignbit_b32 v11, v12, v11, 16 -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v23 +; SI-NEXT: v_lshr_b64 v[11:12], v[11:12], 16 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v43 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v22 ; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 ; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 ; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v20 -; SI-NEXT: v_alignbit_b32 v12, v13, v12, 16 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v21 +; SI-NEXT: v_lshr_b64 v[12:13], v[12:13], 16 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v42 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v20 ; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 ; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 ; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v18 -; SI-NEXT: v_alignbit_b32 v13, v14, v13, 16 -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v19 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; SI-NEXT: v_lshr_b64 v[13:14], v[13:14], 16 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v41 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; SI-NEXT: v_add_f32_e32 v17, 
0x40c00000, v17 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v18 ; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 ; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 ; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_alignbit_b32 v14, v15, v14, 16 -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v16 -; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v17 -; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_alignbit_b32 v15, v16, v15, 16 +; SI-NEXT: v_lshr_b64 v[16:17], v[16:17], 16 +; SI-NEXT: v_lshr_b64 v[14:15], v[14:15], 16 +; SI-NEXT: v_mov_b32_e32 v15, v16 ; SI-NEXT: .LBB47_3: ; %end +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB47_4: ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 @@ -21555,11 +21641,11 @@ define inreg <16 x float> @bitcast_v32bf16_to_v16f32_scalar(<32 x bfloat> inreg ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 ; 4-byte Folded Spill ; VI-NEXT: s_mov_b64 exec, s[4:5] -; VI-NEXT: v_writelane_b32 v19, s30, 0 +; VI-NEXT: v_writelane_b32 v20, s30, 0 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; VI-NEXT: v_writelane_b32 v19, s31, 1 +; VI-NEXT: v_writelane_b32 v20, s31, 1 ; VI-NEXT: v_readfirstlane_b32 s30, v0 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_readfirstlane_b32 s31, v1 @@ -21567,295 +21653,303 @@ define inreg <16 x float> @bitcast_v32bf16_to_v16f32_scalar(<32 x bfloat> inreg ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_cbranch_execnz .LBB47_4 ; VI-NEXT: .LBB47_2: ; %cmp.true -; VI-NEXT: s_lshl_b32 s4, s31, 16 -; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: 
v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s31, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_mov_b32_e32 v16, 0x40c00000 ; VI-NEXT: s_lshl_b32 s4, s30, 16 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_add_f32_e32 v3, s4, v0 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_add_f32_e32 v0, s4, v16 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; VI-NEXT: s_and_b32 s4, s30, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: v_add_f32_e32 v4, s4, v0 -; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; VI-NEXT: v_alignbit_b32 v15, v2, v1, 16 -; VI-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-NEXT: s_lshl_b32 s4, s29, 16 -; VI-NEXT: v_alignbit_b32 v14, v1, v3, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; VI-NEXT: v_add_f32_e32 v1, s4, v16 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s29, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; VI-NEXT: s_lshl_b32 s4, s28, 16 -; VI-NEXT: v_alignbit_b32 v13, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s28, 0xffff0000 ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_add_f32_e32 v2, s4, v16 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: s_and_b32 s6, s28, 0xffff0000 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s27, 16 -; VI-NEXT: v_alignbit_b32 v12, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s27, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; 
VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; VI-NEXT: v_add_f32_e32 v2, s6, v16 +; VI-NEXT: v_bfe_u32 v5, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v2 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc +; VI-NEXT: v_lshrrev_b64 v[14:15], 16, v[0:1] +; VI-NEXT: v_cndmask_b32_e64 v0, v3, v4, s[4:5] +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; VI-NEXT: s_and_b32 s5, s26, 0xffff0000 +; VI-NEXT: v_lshrrev_b64 v[12:13], 16, v[0:1] +; VI-NEXT: v_add_f32_e32 v0, s5, v16 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 ; VI-NEXT: s_lshl_b32 s4, s26, 16 -; VI-NEXT: v_alignbit_b32 v11, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s26, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s25, 16 -; VI-NEXT: v_alignbit_b32 v10, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_add_f32_e32 v0, s4, v16 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s25, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: s_and_b32 s5, s24, 0xffff0000 +; VI-NEXT: v_lshrrev_b64 v[10:11], 16, v[0:1] +; VI-NEXT: v_add_f32_e32 v0, s5, v16 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 ; VI-NEXT: s_lshl_b32 s4, s24, 16 -; VI-NEXT: v_alignbit_b32 v9, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s24, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; 
VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s23, 16 -; VI-NEXT: v_alignbit_b32 v8, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_add_f32_e32 v0, s4, v16 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: s_and_b32 s5, s22, 0xffff0000 +; VI-NEXT: v_lshrrev_b64 v[8:9], 16, v[0:1] +; VI-NEXT: v_add_f32_e32 v0, s5, v16 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 ; VI-NEXT: s_lshl_b32 s4, s22, 16 -; VI-NEXT: v_alignbit_b32 v7, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s22, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s21, 16 -; VI-NEXT: v_alignbit_b32 v6, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_add_f32_e32 v0, s4, v16 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: 
v_cndmask_b32_e32 v0, v2, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: s_and_b32 s5, s20, 0xffff0000 +; VI-NEXT: v_lshrrev_b64 v[6:7], 16, v[0:1] +; VI-NEXT: v_add_f32_e32 v0, s5, v16 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 ; VI-NEXT: s_lshl_b32 s4, s20, 16 -; VI-NEXT: v_alignbit_b32 v5, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s19, 16 -; VI-NEXT: v_alignbit_b32 v4, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_add_f32_e32 v0, s4, v16 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v16, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v16, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: s_and_b32 s5, s18, 0xffff0000 +; VI-NEXT: v_lshrrev_b64 v[4:5], 16, v[0:1] +; VI-NEXT: v_add_f32_e32 v0, s5, v16 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 ; VI-NEXT: s_lshl_b32 s4, s18, 16 -; VI-NEXT: v_alignbit_b32 v3, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_add_f32_e32 v0, s4, v16 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v16, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v16, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v16, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v2 -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16 -; VI-NEXT: v_or_b32_e32 v17, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v16, v17, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s17, 16 -; VI-NEXT: v_alignbit_b32 v2, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 
v1, s4, v0 -; VI-NEXT: v_bfe_u32 v16, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v1 -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16 -; VI-NEXT: v_or_b32_e32 v17, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v16, v17, vcc -; VI-NEXT: v_add_f32_e32 v16, s4, v0 -; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 -; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 -; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 -; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 -; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc -; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; VI-NEXT: s_lshl_b32 s4, s16, 16 -; VI-NEXT: v_alignbit_b32 v1, v16, v1, 16 -; VI-NEXT: v_add_f32_e32 v16, s4, v0 -; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 -; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 +; VI-NEXT: v_lshrrev_b64 v[2:3], 16, v[0:1] +; VI-NEXT: v_add_f32_e32 v0, s4, v16 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: s_and_b32 s6, s16, 0xffff0000 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; VI-NEXT: v_add_f32_e32 v0, s6, v16 +; VI-NEXT: v_bfe_u32 v5, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc +; VI-NEXT: s_lshl_b32 s6, s17, 16 +; VI-NEXT: v_cndmask_b32_e64 v0, v1, v3, s[4:5] +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v5 +; VI-NEXT: v_add_f32_e32 v3, s6, v16 +; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] +; VI-NEXT: v_bfe_u32 v1, v3, 16, 1 +; VI-NEXT: s_and_b32 s6, s17, 0xffff0000 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; VI-NEXT: v_add_f32_e32 v3, s6, v16 +; VI-NEXT: v_bfe_u32 v7, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v3 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc +; VI-NEXT: s_lshl_b32 s6, s19, 16 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v3 +; VI-NEXT: v_add_f32_e32 v3, s6, v16 +; VI-NEXT: v_cndmask_b32_e64 v17, v1, v5, s[4:5] +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: s_and_b32 s6, s19, 0xffff0000 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; VI-NEXT: v_add_f32_e32 v3, s6, v16 +; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18] +; VI-NEXT: v_bfe_u32 v9, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v3 +; VI-NEXT: s_lshl_b32 s6, s21, 16 +; VI-NEXT: v_mov_b32_e32 v1, v17 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; VI-NEXT: v_cndmask_b32_e64 v17, v5, v7, s[4:5] +; VI-NEXT: v_add_f32_e32 v5, s6, v16 +; VI-NEXT: v_or_b32_e32 v11, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_bfe_u32 v7, v5, 16, 1 +; VI-NEXT: s_and_b32 s6, s21, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v3, v9, v11, vcc +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v5 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 +; VI-NEXT: v_add_f32_e32 v5, s6, 
v16 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v3 +; VI-NEXT: v_bfe_u32 v11, v5, 16, 1 +; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18] +; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v5 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 +; VI-NEXT: s_lshl_b32 s6, s23, 16 +; VI-NEXT: v_mov_b32_e32 v3, v17 +; VI-NEXT: v_or_b32_e32 v13, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: v_cndmask_b32_e64 v17, v7, v9, s[4:5] +; VI-NEXT: v_add_f32_e32 v7, s6, v16 +; VI-NEXT: v_cndmask_b32_e32 v5, v11, v13, vcc +; VI-NEXT: v_bfe_u32 v9, v7, 16, 1 +; VI-NEXT: s_and_b32 s6, s23, 0xffff0000 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v5 +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v7 +; VI-NEXT: v_or_b32_e32 v11, 0x400000, v7 +; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v7, v7 +; VI-NEXT: v_add_f32_e32 v7, s6, v16 +; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18] +; VI-NEXT: v_bfe_u32 v13, v7, 16, 1 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v7 +; VI-NEXT: s_lshl_b32 s6, s25, 16 +; VI-NEXT: v_mov_b32_e32 v5, v17 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 +; VI-NEXT: v_cndmask_b32_e64 v17, v9, v11, s[4:5] +; VI-NEXT: v_add_f32_e32 v9, s6, v16 +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v7 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; VI-NEXT: v_bfe_u32 v11, v9, 16, 1 +; VI-NEXT: s_and_b32 s6, s25, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v7, v13, v15, vcc +; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v9 +; VI-NEXT: v_or_b32_e32 v13, 0x400000, v9 +; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v9, v9 +; VI-NEXT: v_add_f32_e32 v9, s6, v16 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v7 +; VI-NEXT: v_bfe_u32 v15, v9, 16, 1 +; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18] +; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v9 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 +; VI-NEXT: v_mov_b32_e32 v7, v17 +; VI-NEXT: v_or_b32_e32 v17, 0x400000, v9 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; VI-NEXT: v_cndmask_b32_e32 v9, v15, v17, vcc +; VI-NEXT: s_lshl_b32 s6, s27, 16 +; VI-NEXT: v_cndmask_b32_e64 v17, v11, v13, s[4:5] +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v9 +; VI-NEXT: v_add_f32_e32 v11, s6, v16 +; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18] +; VI-NEXT: v_bfe_u32 v13, v11, 16, 1 +; VI-NEXT: s_and_b32 s6, s27, 0xffff0000 +; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v11 +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v11 +; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v11, v11 +; VI-NEXT: v_add_f32_e32 v11, s6, v16 +; VI-NEXT: v_mov_b32_e32 v9, v17 +; VI-NEXT: v_bfe_u32 v17, v11, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v11 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 ; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 -; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 -; VI-NEXT: v_add_f32_e32 v0, s4, v0 -; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc -; VI-NEXT: v_bfe_u32 v17, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v0 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v11 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; VI-NEXT: s_and_b32 s7, s31, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v11, v17, v18, vcc +; VI-NEXT: v_cndmask_b32_e64 v17, v13, v15, s[4:5] +; VI-NEXT: v_add_f32_e32 v13, s7, v16 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v11 +; VI-NEXT: v_bfe_u32 v15, v13, 16, 1 +; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18] +; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v13 +; VI-NEXT: s_lshl_b32 s6, s31, 16 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 +; VI-NEXT: 
v_mov_b32_e32 v11, v17 +; VI-NEXT: v_or_b32_e32 v17, 0x400000, v13 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; VI-NEXT: v_add_f32_e32 v13, s6, v16 +; VI-NEXT: v_cndmask_b32_e32 v15, v15, v17, vcc +; VI-NEXT: v_bfe_u32 v17, v13, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v13 ; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 -; VI-NEXT: v_or_b32_e32 v18, 0x400000, v0 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v17, v18, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v0, v16, 16 +; VI-NEXT: s_and_b32 s4, s29, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v13 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; VI-NEXT: v_add_f32_e32 v13, s4, v16 +; VI-NEXT: v_cndmask_b32_e32 v17, v17, v18, vcc +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v15 +; VI-NEXT: v_bfe_u32 v15, v13, 16, 1 +; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v13 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 +; VI-NEXT: v_or_b32_e32 v19, 0x400000, v13 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; VI-NEXT: s_lshl_b32 s4, s29, 16 +; VI-NEXT: v_cndmask_b32_e32 v13, v15, v19, vcc +; VI-NEXT: v_add_f32_e32 v15, s4, v16 +; VI-NEXT: v_bfe_u32 v16, v15, 16, 1 +; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v15 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16 +; VI-NEXT: v_or_b32_e32 v19, 0x400000, v15 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; VI-NEXT: v_cndmask_b32_e32 v15, v16, v19, vcc +; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v13 +; VI-NEXT: v_lshrrev_b64 v[15:16], 16, v[15:16] +; VI-NEXT: v_lshrrev_b64 v[16:17], 16, v[17:18] +; VI-NEXT: v_mov_b32_e32 v13, v15 +; VI-NEXT: v_mov_b32_e32 v15, v16 ; VI-NEXT: s_branch .LBB47_5 ; VI-NEXT: .LBB47_3: ; VI-NEXT: s_branch .LBB47_2 @@ -21877,10 +21971,10 @@ define inreg <16 x float> @bitcast_v32bf16_to_v16f32_scalar(<32 x bfloat> inreg ; VI-NEXT: v_mov_b32_e32 v14, s30 ; VI-NEXT: v_mov_b32_e32 v15, s31 ; VI-NEXT: .LBB47_5: ; %end -; VI-NEXT: v_readlane_b32 s31, v19, 1 -; VI-NEXT: v_readlane_b32 s30, v19, 0 +; VI-NEXT: v_readlane_b32 s31, v20, 1 +; VI-NEXT: v_readlane_b32 s30, v20, 0 ; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 ; 4-byte Folded Reload ; VI-NEXT: s_mov_b64 exec, s[4:5] ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] @@ -35773,172 +35867,211 @@ define inreg <8 x i64> @bitcast_v32bf16_to_v8i64_scalar(<32 x bfloat> inreg %a, ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 
; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mul_f32_e64 v34, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v35, 1.0, s16 -; SI-NEXT: v_mul_f32_e32 v32, 1.0, v1 -; SI-NEXT: v_mul_f32_e32 v33, 1.0, v0 -; SI-NEXT: v_mul_f32_e32 v30, 1.0, v3 -; SI-NEXT: v_mul_f32_e32 v31, 1.0, v2 -; SI-NEXT: v_mul_f32_e32 v28, 1.0, v5 -; SI-NEXT: v_mul_f32_e32 v29, 1.0, v4 -; SI-NEXT: v_mul_f32_e32 v26, 1.0, v7 -; SI-NEXT: v_mul_f32_e32 v27, 1.0, v6 -; SI-NEXT: v_mul_f32_e32 v24, 1.0, v9 -; SI-NEXT: v_mul_f32_e32 v25, 1.0, v8 -; SI-NEXT: v_mul_f32_e32 v22, 1.0, v11 -; SI-NEXT: v_mul_f32_e32 v23, 1.0, v10 -; SI-NEXT: v_mul_f32_e32 v20, 1.0, v13 -; SI-NEXT: v_mul_f32_e32 v21, 1.0, v12 -; SI-NEXT: v_mul_f32_e32 v18, 1.0, v15 -; SI-NEXT: v_mul_f32_e32 v19, 1.0, v14 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_mul_f32_e64 v60, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v59, 1.0, s19 +; SI-NEXT: v_mul_f32_e32 v56, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v47, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v46, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v45, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v44, 1.0, v9 +; SI-NEXT: v_mul_f32_e32 v43, 1.0, v11 +; SI-NEXT: v_mul_f32_e32 v42, 1.0, v13 +; SI-NEXT: v_mul_f32_e32 v41, 1.0, v15 ; SI-NEXT: v_mul_f32_e32 v17, 1.0, v17 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v63, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v62, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v61, 1.0, s25 +; SI-NEXT: v_mul_f32_e64 v58, 1.0, s27 +; SI-NEXT: v_mul_f32_e64 v57, 1.0, s29 +; SI-NEXT: v_mul_f32_e32 v32, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v30, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v28, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v26, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v24, 1.0, v8 +; SI-NEXT: v_mul_f32_e32 v22, 1.0, v10 +; SI-NEXT: v_mul_f32_e32 v20, 1.0, v12 +; SI-NEXT: v_mul_f32_e32 v18, 1.0, v14 ; SI-NEXT: v_mul_f32_e32 v16, 1.0, v16 -; SI-NEXT: v_mul_f32_e64 v54, 1.0, s19 -; SI-NEXT: v_mul_f32_e64 v55, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v52, 1.0, s21 -; SI-NEXT: v_mul_f32_e64 v53, 1.0, s20 -; SI-NEXT: v_mul_f32_e64 v50, 1.0, s23 -; SI-NEXT: v_mul_f32_e64 v51, 1.0, s22 -; SI-NEXT: v_mul_f32_e64 v48, 1.0, s25 -; SI-NEXT: v_mul_f32_e64 v49, 1.0, s24 -; SI-NEXT: v_mul_f32_e64 v38, 1.0, s27 -; SI-NEXT: v_mul_f32_e64 v39, 1.0, s26 -; SI-NEXT: v_mul_f32_e64 v36, 1.0, s29 -; SI-NEXT: v_mul_f32_e64 v37, 1.0, s28 +; SI-NEXT: v_mul_f32_e64 v54, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v52, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v50, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v48, 1.0, s22 +; SI-NEXT: v_mul_f32_e64 v38, 1.0, s24 +; SI-NEXT: v_mul_f32_e64 v36, 1.0, s26 +; SI-NEXT: v_mul_f32_e64 v34, 1.0, s28 ; SI-NEXT: s_cbranch_scc0 .LBB67_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v34 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v54 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v52 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v50 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v48 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v38 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v36 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v32 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v30 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v28 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v22 -; SI-NEXT: 
v_lshrrev_b32_e32 v13, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v17 -; SI-NEXT: v_alignbit_b32 v0, v0, v35, 16 -; SI-NEXT: v_alignbit_b32 v1, v1, v55, 16 -; SI-NEXT: v_alignbit_b32 v2, v2, v53, 16 -; SI-NEXT: v_alignbit_b32 v3, v3, v51, 16 -; SI-NEXT: v_alignbit_b32 v4, v4, v49, 16 -; SI-NEXT: v_alignbit_b32 v5, v5, v39, 16 -; SI-NEXT: v_alignbit_b32 v6, v6, v37, 16 -; SI-NEXT: v_alignbit_b32 v7, v7, v33, 16 -; SI-NEXT: v_alignbit_b32 v8, v8, v31, 16 -; SI-NEXT: v_alignbit_b32 v9, v9, v29, 16 -; SI-NEXT: v_alignbit_b32 v10, v10, v27, 16 -; SI-NEXT: v_alignbit_b32 v11, v11, v25, 16 -; SI-NEXT: v_alignbit_b32 v12, v12, v23, 16 -; SI-NEXT: v_alignbit_b32 v13, v13, v21, 16 -; SI-NEXT: v_alignbit_b32 v14, v14, v19, 16 -; SI-NEXT: v_alignbit_b32 v15, v15, v16, 16 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v60 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v59 +; SI-NEXT: v_lshr_b64 v[0:1], v[54:55], 16 +; SI-NEXT: v_lshr_b64 v[1:2], v[52:53], 16 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v63 +; SI-NEXT: v_lshr_b64 v[2:3], v[50:51], 16 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v62 +; SI-NEXT: v_lshr_b64 v[3:4], v[48:49], 16 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v61 +; SI-NEXT: v_lshr_b64 v[4:5], v[38:39], 16 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v58 +; SI-NEXT: v_lshr_b64 v[5:6], v[36:37], 16 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v57 +; SI-NEXT: v_lshr_b64 v[6:7], v[34:35], 16 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v56 +; SI-NEXT: v_lshr_b64 v[7:8], v[32:33], 16 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v47 +; SI-NEXT: v_lshr_b64 v[8:9], v[30:31], 16 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v46 +; SI-NEXT: v_lshr_b64 v[9:10], v[28:29], 16 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v45 +; SI-NEXT: v_lshr_b64 v[10:11], v[26:27], 16 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v44 +; SI-NEXT: v_lshr_b64 v[11:12], v[24:25], 16 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v43 +; SI-NEXT: v_lshr_b64 v[12:13], v[22:23], 16 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v42 +; SI-NEXT: v_lshr_b64 v[13:14], v[20:21], 16 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v41 +; SI-NEXT: v_lshr_b64 v[14:15], v[18:19], 16 +; SI-NEXT: v_mov_b32_e32 v15, v17 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v15 +; SI-NEXT: v_lshr_b64 v[39:40], v[16:17], 16 +; SI-NEXT: v_mov_b32_e32 v17, v15 +; SI-NEXT: v_mov_b32_e32 v15, v39 ; SI-NEXT: s_cbranch_execnz .LBB67_3 ; SI-NEXT: .LBB67_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v34 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v35 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v60 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v54 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v59 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v54 -; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v55 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v52 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v52 -; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v53 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 +; SI-NEXT: v_lshr_b64 v[1:2], v[2:3], 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v63 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v50 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; 
SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v50 -; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v51 +; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 16 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v62 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v48 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v48 -; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v49 +; SI-NEXT: v_lshr_b64 v[3:4], v[3:4], 16 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v61 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v38 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v38 -; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v39 +; SI-NEXT: v_lshr_b64 v[4:5], v[4:5], 16 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v58 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v36 ; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 ; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v36 -; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v37 +; SI-NEXT: v_lshr_b64 v[5:6], v[5:6], 16 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v57 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v34 ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 ; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 ; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v32 -; SI-NEXT: v_alignbit_b32 v6, v7, v6, 16 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v33 +; SI-NEXT: v_lshr_b64 v[6:7], v[6:7], 16 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v56 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v32 ; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 ; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v30 -; SI-NEXT: v_alignbit_b32 v7, v8, v7, 16 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v31 +; SI-NEXT: v_lshr_b64 v[7:8], v[7:8], 16 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v47 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v30 ; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 ; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 ; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v28 -; SI-NEXT: v_alignbit_b32 v8, v9, v8, 16 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v29 +; SI-NEXT: v_lshr_b64 v[8:9], v[8:9], 16 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v46 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v28 ; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 ; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 ; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v26 -; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16 -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v27 +; SI-NEXT: v_lshr_b64 v[9:10], v[9:10], 16 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v45 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v26 ; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 ; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 ; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v24 -; SI-NEXT: v_alignbit_b32 v10, v11, v10, 16 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v25 +; SI-NEXT: v_lshr_b64 v[10:11], v[10:11], 16 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v44 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v24 ; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 ; 
SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 ; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v22 -; SI-NEXT: v_alignbit_b32 v11, v12, v11, 16 -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v23 +; SI-NEXT: v_lshr_b64 v[11:12], v[11:12], 16 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v43 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v22 ; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 ; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 ; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v20 -; SI-NEXT: v_alignbit_b32 v12, v13, v12, 16 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v21 +; SI-NEXT: v_lshr_b64 v[12:13], v[12:13], 16 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v42 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v20 ; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 ; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 ; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v18 -; SI-NEXT: v_alignbit_b32 v13, v14, v13, 16 -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v19 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; SI-NEXT: v_lshr_b64 v[13:14], v[13:14], 16 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v41 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v18 ; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 ; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 ; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_alignbit_b32 v14, v15, v14, 16 -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v16 -; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v17 -; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_alignbit_b32 v15, v16, v15, 16 +; SI-NEXT: v_lshr_b64 v[16:17], v[16:17], 16 +; SI-NEXT: v_lshr_b64 v[14:15], v[14:15], 16 +; SI-NEXT: v_mov_b32_e32 v15, v16 ; SI-NEXT: .LBB67_3: ; %end +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB67_4: ; SI-NEXT: ; implicit-def: 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 @@ -35948,11 +36081,11 @@ define inreg <8 x i64> @bitcast_v32bf16_to_v8i64_scalar(<32 x bfloat> inreg %a, ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 ; 4-byte Folded Spill ; VI-NEXT: s_mov_b64 exec, s[4:5] -; VI-NEXT: v_writelane_b32 v19, s30, 0 +; VI-NEXT: v_writelane_b32 v20, s30, 0 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; VI-NEXT: v_writelane_b32 v19, s31, 1 +; VI-NEXT: v_writelane_b32 v20, s31, 1 ; VI-NEXT: v_readfirstlane_b32 s30, v0 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_readfirstlane_b32 s31, v1 @@ -35960,295 +36093,303 @@ define inreg <8 x i64> @bitcast_v32bf16_to_v8i64_scalar(<32 x bfloat> inreg %a, ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_cbranch_execnz .LBB67_4 ; VI-NEXT: .LBB67_2: ; %cmp.true -; VI-NEXT: s_lshl_b32 s4, s31, 16 -; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s31, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_mov_b32_e32 v16, 0x40c00000 ; VI-NEXT: s_lshl_b32 s4, s30, 16 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_add_f32_e32 v3, s4, v0 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_add_f32_e32 v0, s4, v16 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; VI-NEXT: s_and_b32 s4, s30, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: v_add_f32_e32 v4, s4, v0 -; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; VI-NEXT: v_alignbit_b32 v15, v2, v1, 16 -; VI-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-NEXT: s_lshl_b32 s4, s29, 16 -; VI-NEXT: v_alignbit_b32 v14, v1, v3, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; VI-NEXT: v_add_f32_e32 v1, s4, v16 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s29, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, 
v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; VI-NEXT: s_lshl_b32 s4, s28, 16 -; VI-NEXT: v_alignbit_b32 v13, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s28, 0xffff0000 ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_add_f32_e32 v2, s4, v16 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: s_and_b32 s6, s28, 0xffff0000 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s27, 16 -; VI-NEXT: v_alignbit_b32 v12, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s27, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; VI-NEXT: v_add_f32_e32 v2, s6, v16 +; VI-NEXT: v_bfe_u32 v5, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v2 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc +; VI-NEXT: v_lshrrev_b64 v[14:15], 16, v[0:1] +; VI-NEXT: v_cndmask_b32_e64 v0, v3, v4, s[4:5] +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; VI-NEXT: s_and_b32 s5, s26, 0xffff0000 +; VI-NEXT: v_lshrrev_b64 v[12:13], 16, v[0:1] +; VI-NEXT: v_add_f32_e32 v0, s5, v16 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 ; VI-NEXT: s_lshl_b32 s4, s26, 16 -; VI-NEXT: v_alignbit_b32 v11, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s26, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s25, 16 -; VI-NEXT: v_alignbit_b32 v10, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_add_f32_e32 v0, s4, v16 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; 
VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s25, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: s_and_b32 s5, s24, 0xffff0000 +; VI-NEXT: v_lshrrev_b64 v[10:11], 16, v[0:1] +; VI-NEXT: v_add_f32_e32 v0, s5, v16 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 ; VI-NEXT: s_lshl_b32 s4, s24, 16 -; VI-NEXT: v_alignbit_b32 v9, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s24, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s23, 16 -; VI-NEXT: v_alignbit_b32 v8, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_add_f32_e32 v0, s4, v16 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: s_and_b32 s5, s22, 0xffff0000 +; VI-NEXT: v_lshrrev_b64 v[8:9], 16, v[0:1] +; VI-NEXT: v_add_f32_e32 v0, s5, v16 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 ; VI-NEXT: s_lshl_b32 s4, s22, 16 -; VI-NEXT: v_alignbit_b32 v7, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s22, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 
v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s21, 16 -; VI-NEXT: v_alignbit_b32 v6, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_add_f32_e32 v0, s4, v16 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: s_and_b32 s5, s20, 0xffff0000 +; VI-NEXT: v_lshrrev_b64 v[6:7], 16, v[0:1] +; VI-NEXT: v_add_f32_e32 v0, s5, v16 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 ; VI-NEXT: s_lshl_b32 s4, s20, 16 -; VI-NEXT: v_alignbit_b32 v5, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s19, 16 -; VI-NEXT: v_alignbit_b32 v4, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_add_f32_e32 v0, s4, v16 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v16, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v16, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: s_and_b32 s5, s18, 
0xffff0000 +; VI-NEXT: v_lshrrev_b64 v[4:5], 16, v[0:1] +; VI-NEXT: v_add_f32_e32 v0, s5, v16 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 ; VI-NEXT: s_lshl_b32 s4, s18, 16 -; VI-NEXT: v_alignbit_b32 v3, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_add_f32_e32 v0, s4, v16 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v16, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v16, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v16, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v2 -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16 -; VI-NEXT: v_or_b32_e32 v17, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v16, v17, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s17, 16 -; VI-NEXT: v_alignbit_b32 v2, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v16, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v1 -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16 -; VI-NEXT: v_or_b32_e32 v17, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v16, v17, vcc -; VI-NEXT: v_add_f32_e32 v16, s4, v0 -; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 -; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 -; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 -; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 -; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc -; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; VI-NEXT: s_lshl_b32 s4, s16, 16 -; VI-NEXT: v_alignbit_b32 v1, v16, v1, 16 -; VI-NEXT: v_add_f32_e32 v16, s4, v0 -; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 -; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 +; VI-NEXT: v_lshrrev_b64 v[2:3], 16, v[0:1] +; VI-NEXT: v_add_f32_e32 v0, s4, v16 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: s_and_b32 s6, s16, 0xffff0000 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; VI-NEXT: v_add_f32_e32 v0, s6, v16 +; VI-NEXT: v_bfe_u32 v5, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc +; VI-NEXT: s_lshl_b32 s6, s17, 16 +; VI-NEXT: v_cndmask_b32_e64 v0, v1, v3, s[4:5] +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v5 +; VI-NEXT: v_add_f32_e32 v3, s6, v16 +; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] +; VI-NEXT: v_bfe_u32 v1, v3, 16, 1 +; VI-NEXT: s_and_b32 s6, s17, 0xffff0000 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; VI-NEXT: v_add_f32_e32 v3, s6, v16 +; VI-NEXT: v_bfe_u32 v7, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v3 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: 
v_add_u32_e32 v7, vcc, 0x7fff, v7 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc +; VI-NEXT: s_lshl_b32 s6, s19, 16 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v3 +; VI-NEXT: v_add_f32_e32 v3, s6, v16 +; VI-NEXT: v_cndmask_b32_e64 v17, v1, v5, s[4:5] +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: s_and_b32 s6, s19, 0xffff0000 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; VI-NEXT: v_add_f32_e32 v3, s6, v16 +; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18] +; VI-NEXT: v_bfe_u32 v9, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v3 +; VI-NEXT: s_lshl_b32 s6, s21, 16 +; VI-NEXT: v_mov_b32_e32 v1, v17 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; VI-NEXT: v_cndmask_b32_e64 v17, v5, v7, s[4:5] +; VI-NEXT: v_add_f32_e32 v5, s6, v16 +; VI-NEXT: v_or_b32_e32 v11, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_bfe_u32 v7, v5, 16, 1 +; VI-NEXT: s_and_b32 s6, s21, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v3, v9, v11, vcc +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v5 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 +; VI-NEXT: v_add_f32_e32 v5, s6, v16 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v3 +; VI-NEXT: v_bfe_u32 v11, v5, 16, 1 +; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18] +; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v5 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 +; VI-NEXT: s_lshl_b32 s6, s23, 16 +; VI-NEXT: v_mov_b32_e32 v3, v17 +; VI-NEXT: v_or_b32_e32 v13, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: v_cndmask_b32_e64 v17, v7, v9, s[4:5] +; VI-NEXT: v_add_f32_e32 v7, s6, v16 +; VI-NEXT: v_cndmask_b32_e32 v5, v11, v13, vcc +; VI-NEXT: v_bfe_u32 v9, v7, 16, 1 +; VI-NEXT: s_and_b32 s6, s23, 0xffff0000 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v5 +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v7 +; VI-NEXT: v_or_b32_e32 v11, 0x400000, v7 +; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v7, v7 +; VI-NEXT: v_add_f32_e32 v7, s6, v16 +; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18] +; VI-NEXT: v_bfe_u32 v13, v7, 16, 1 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v7 +; VI-NEXT: s_lshl_b32 s6, s25, 16 +; VI-NEXT: v_mov_b32_e32 v5, v17 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 +; VI-NEXT: v_cndmask_b32_e64 v17, v9, v11, s[4:5] +; VI-NEXT: v_add_f32_e32 v9, s6, v16 +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v7 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; VI-NEXT: v_bfe_u32 v11, v9, 16, 1 +; VI-NEXT: s_and_b32 s6, s25, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v7, v13, v15, vcc +; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v9 +; VI-NEXT: v_or_b32_e32 v13, 0x400000, v9 +; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v9, v9 +; VI-NEXT: v_add_f32_e32 v9, s6, v16 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v7 +; VI-NEXT: v_bfe_u32 v15, v9, 16, 1 +; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18] +; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v9 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 +; VI-NEXT: v_mov_b32_e32 v7, v17 +; VI-NEXT: v_or_b32_e32 v17, 0x400000, v9 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; VI-NEXT: v_cndmask_b32_e32 v9, v15, v17, vcc +; VI-NEXT: s_lshl_b32 s6, s27, 16 +; VI-NEXT: v_cndmask_b32_e64 v17, v11, v13, s[4:5] +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v9 +; VI-NEXT: v_add_f32_e32 v11, s6, v16 +; VI-NEXT: v_lshrrev_b64 
v[17:18], 16, v[17:18] +; VI-NEXT: v_bfe_u32 v13, v11, 16, 1 +; VI-NEXT: s_and_b32 s6, s27, 0xffff0000 +; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v11 +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v11 +; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v11, v11 +; VI-NEXT: v_add_f32_e32 v11, s6, v16 +; VI-NEXT: v_mov_b32_e32 v9, v17 +; VI-NEXT: v_bfe_u32 v17, v11, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v11 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 ; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 -; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 -; VI-NEXT: v_add_f32_e32 v0, s4, v0 -; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc -; VI-NEXT: v_bfe_u32 v17, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v0 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v11 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; VI-NEXT: s_and_b32 s7, s31, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v11, v17, v18, vcc +; VI-NEXT: v_cndmask_b32_e64 v17, v13, v15, s[4:5] +; VI-NEXT: v_add_f32_e32 v13, s7, v16 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v11 +; VI-NEXT: v_bfe_u32 v15, v13, 16, 1 +; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18] +; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v13 +; VI-NEXT: s_lshl_b32 s6, s31, 16 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 +; VI-NEXT: v_mov_b32_e32 v11, v17 +; VI-NEXT: v_or_b32_e32 v17, 0x400000, v13 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; VI-NEXT: v_add_f32_e32 v13, s6, v16 +; VI-NEXT: v_cndmask_b32_e32 v15, v15, v17, vcc +; VI-NEXT: v_bfe_u32 v17, v13, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v13 ; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 -; VI-NEXT: v_or_b32_e32 v18, 0x400000, v0 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v17, v18, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v0, v16, 16 +; VI-NEXT: s_and_b32 s4, s29, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v13 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; VI-NEXT: v_add_f32_e32 v13, s4, v16 +; VI-NEXT: v_cndmask_b32_e32 v17, v17, v18, vcc +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v15 +; VI-NEXT: v_bfe_u32 v15, v13, 16, 1 +; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v13 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 +; VI-NEXT: v_or_b32_e32 v19, 0x400000, v13 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; VI-NEXT: s_lshl_b32 s4, s29, 16 +; VI-NEXT: v_cndmask_b32_e32 v13, v15, v19, vcc +; VI-NEXT: v_add_f32_e32 v15, s4, v16 +; VI-NEXT: v_bfe_u32 v16, v15, 16, 1 +; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v15 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16 +; VI-NEXT: v_or_b32_e32 v19, 0x400000, v15 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; VI-NEXT: v_cndmask_b32_e32 v15, v16, v19, vcc +; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v13 +; VI-NEXT: v_lshrrev_b64 v[15:16], 16, v[15:16] +; VI-NEXT: v_lshrrev_b64 v[16:17], 16, v[17:18] +; VI-NEXT: v_mov_b32_e32 v13, v15 +; VI-NEXT: v_mov_b32_e32 v15, v16 ; VI-NEXT: s_branch .LBB67_5 ; VI-NEXT: .LBB67_3: ; VI-NEXT: s_branch .LBB67_2 @@ -36270,10 +36411,10 @@ define inreg <8 x i64> @bitcast_v32bf16_to_v8i64_scalar(<32 x bfloat> inreg %a, ; VI-NEXT: v_mov_b32_e32 v14, s30 ; VI-NEXT: v_mov_b32_e32 v15, s31 ; VI-NEXT: .LBB67_5: ; %end -; VI-NEXT: v_readlane_b32 s31, v19, 1 -; VI-NEXT: v_readlane_b32 s30, v19, 0 +; VI-NEXT: v_readlane_b32 s31, v20, 1 +; VI-NEXT: v_readlane_b32 s30, v20, 0 ; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 ; 4-byte 
Folded Reload ; VI-NEXT: s_mov_b64 exec, s[4:5] ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] @@ -49226,172 +49367,211 @@ define inreg <8 x double> @bitcast_v32bf16_to_v8f64_scalar(<32 x bfloat> inreg % ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mul_f32_e64 v34, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v35, 1.0, s16 -; SI-NEXT: v_mul_f32_e32 v32, 1.0, v1 -; SI-NEXT: v_mul_f32_e32 v33, 1.0, v0 -; SI-NEXT: v_mul_f32_e32 v30, 1.0, v3 -; SI-NEXT: v_mul_f32_e32 v31, 1.0, v2 -; SI-NEXT: v_mul_f32_e32 v28, 1.0, v5 -; SI-NEXT: v_mul_f32_e32 v29, 1.0, v4 -; SI-NEXT: v_mul_f32_e32 v26, 1.0, v7 -; SI-NEXT: v_mul_f32_e32 v27, 1.0, v6 -; SI-NEXT: v_mul_f32_e32 v24, 1.0, v9 -; SI-NEXT: v_mul_f32_e32 v25, 1.0, v8 -; SI-NEXT: v_mul_f32_e32 v22, 1.0, v11 -; SI-NEXT: v_mul_f32_e32 v23, 1.0, v10 -; SI-NEXT: v_mul_f32_e32 v20, 1.0, v13 -; SI-NEXT: v_mul_f32_e32 v21, 1.0, v12 -; SI-NEXT: v_mul_f32_e32 v18, 1.0, v15 -; SI-NEXT: v_mul_f32_e32 v19, 1.0, v14 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_mul_f32_e64 v60, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v59, 1.0, s19 +; SI-NEXT: v_mul_f32_e32 v56, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v47, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v46, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v45, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v44, 1.0, v9 +; SI-NEXT: v_mul_f32_e32 v43, 1.0, v11 +; SI-NEXT: v_mul_f32_e32 v42, 1.0, v13 +; SI-NEXT: v_mul_f32_e32 v41, 1.0, v15 ; SI-NEXT: v_mul_f32_e32 v17, 1.0, v17 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v63, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v62, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v61, 1.0, s25 +; SI-NEXT: v_mul_f32_e64 v58, 1.0, s27 +; SI-NEXT: v_mul_f32_e64 v57, 1.0, s29 +; SI-NEXT: v_mul_f32_e32 v32, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v30, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v28, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v26, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v24, 1.0, v8 +; SI-NEXT: v_mul_f32_e32 v22, 1.0, v10 +; SI-NEXT: v_mul_f32_e32 v20, 1.0, v12 +; SI-NEXT: v_mul_f32_e32 v18, 1.0, v14 ; SI-NEXT: v_mul_f32_e32 v16, 1.0, v16 -; SI-NEXT: v_mul_f32_e64 v54, 1.0, s19 -; SI-NEXT: v_mul_f32_e64 v55, 1.0, s18 
-; SI-NEXT: v_mul_f32_e64 v52, 1.0, s21 -; SI-NEXT: v_mul_f32_e64 v53, 1.0, s20 -; SI-NEXT: v_mul_f32_e64 v50, 1.0, s23 -; SI-NEXT: v_mul_f32_e64 v51, 1.0, s22 -; SI-NEXT: v_mul_f32_e64 v48, 1.0, s25 -; SI-NEXT: v_mul_f32_e64 v49, 1.0, s24 -; SI-NEXT: v_mul_f32_e64 v38, 1.0, s27 -; SI-NEXT: v_mul_f32_e64 v39, 1.0, s26 -; SI-NEXT: v_mul_f32_e64 v36, 1.0, s29 -; SI-NEXT: v_mul_f32_e64 v37, 1.0, s28 +; SI-NEXT: v_mul_f32_e64 v54, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v52, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v50, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v48, 1.0, s22 +; SI-NEXT: v_mul_f32_e64 v38, 1.0, s24 +; SI-NEXT: v_mul_f32_e64 v36, 1.0, s26 +; SI-NEXT: v_mul_f32_e64 v34, 1.0, s28 ; SI-NEXT: s_cbranch_scc0 .LBB83_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v34 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v54 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v52 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v50 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v48 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v38 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v36 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v32 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v30 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v28 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v17 -; SI-NEXT: v_alignbit_b32 v0, v0, v35, 16 -; SI-NEXT: v_alignbit_b32 v1, v1, v55, 16 -; SI-NEXT: v_alignbit_b32 v2, v2, v53, 16 -; SI-NEXT: v_alignbit_b32 v3, v3, v51, 16 -; SI-NEXT: v_alignbit_b32 v4, v4, v49, 16 -; SI-NEXT: v_alignbit_b32 v5, v5, v39, 16 -; SI-NEXT: v_alignbit_b32 v6, v6, v37, 16 -; SI-NEXT: v_alignbit_b32 v7, v7, v33, 16 -; SI-NEXT: v_alignbit_b32 v8, v8, v31, 16 -; SI-NEXT: v_alignbit_b32 v9, v9, v29, 16 -; SI-NEXT: v_alignbit_b32 v10, v10, v27, 16 -; SI-NEXT: v_alignbit_b32 v11, v11, v25, 16 -; SI-NEXT: v_alignbit_b32 v12, v12, v23, 16 -; SI-NEXT: v_alignbit_b32 v13, v13, v21, 16 -; SI-NEXT: v_alignbit_b32 v14, v14, v19, 16 -; SI-NEXT: v_alignbit_b32 v15, v15, v16, 16 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v60 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v59 +; SI-NEXT: v_lshr_b64 v[0:1], v[54:55], 16 +; SI-NEXT: v_lshr_b64 v[1:2], v[52:53], 16 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v63 +; SI-NEXT: v_lshr_b64 v[2:3], v[50:51], 16 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v62 +; SI-NEXT: v_lshr_b64 v[3:4], v[48:49], 16 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v61 +; SI-NEXT: v_lshr_b64 v[4:5], v[38:39], 16 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v58 +; SI-NEXT: v_lshr_b64 v[5:6], v[36:37], 16 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v57 +; SI-NEXT: v_lshr_b64 v[6:7], v[34:35], 16 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v56 +; SI-NEXT: v_lshr_b64 v[7:8], v[32:33], 16 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v47 +; SI-NEXT: v_lshr_b64 v[8:9], v[30:31], 16 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v46 +; SI-NEXT: v_lshr_b64 v[9:10], v[28:29], 16 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v45 +; SI-NEXT: v_lshr_b64 v[10:11], v[26:27], 16 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v44 +; SI-NEXT: v_lshr_b64 v[11:12], v[24:25], 16 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v43 +; SI-NEXT: v_lshr_b64 v[12:13], v[22:23], 16 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v42 +; SI-NEXT: v_lshr_b64 v[13:14], v[20:21], 16 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v41 +; SI-NEXT: v_lshr_b64 v[14:15], v[18:19], 16 +; SI-NEXT: v_mov_b32_e32 v15, v17 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v15 +; SI-NEXT: v_lshr_b64 v[39:40], 
v[16:17], 16 +; SI-NEXT: v_mov_b32_e32 v17, v15 +; SI-NEXT: v_mov_b32_e32 v15, v39 ; SI-NEXT: s_cbranch_execnz .LBB83_3 ; SI-NEXT: .LBB83_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v34 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v35 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v60 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v54 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v59 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v54 -; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v55 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v52 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v52 -; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v53 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 +; SI-NEXT: v_lshr_b64 v[1:2], v[2:3], 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v63 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v50 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v50 -; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v51 +; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 16 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v62 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v48 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v48 -; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v49 +; SI-NEXT: v_lshr_b64 v[3:4], v[3:4], 16 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v61 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v38 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v38 -; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v39 +; SI-NEXT: v_lshr_b64 v[4:5], v[4:5], 16 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v58 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v36 ; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 ; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v36 -; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v37 +; SI-NEXT: v_lshr_b64 v[5:6], v[5:6], 16 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v57 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v34 ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 ; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 ; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v32 -; SI-NEXT: v_alignbit_b32 v6, v7, v6, 16 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v33 +; SI-NEXT: v_lshr_b64 v[6:7], v[6:7], 16 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v56 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v32 ; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 ; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v30 -; SI-NEXT: v_alignbit_b32 v7, v8, v7, 16 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v31 +; SI-NEXT: v_lshr_b64 v[7:8], v[7:8], 16 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v47 +; 
SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v30 ; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 ; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 ; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v28 -; SI-NEXT: v_alignbit_b32 v8, v9, v8, 16 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v29 +; SI-NEXT: v_lshr_b64 v[8:9], v[8:9], 16 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v46 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v28 ; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 ; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 ; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v26 -; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16 -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v27 +; SI-NEXT: v_lshr_b64 v[9:10], v[9:10], 16 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v45 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v26 ; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 ; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 ; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v24 -; SI-NEXT: v_alignbit_b32 v10, v11, v10, 16 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v25 +; SI-NEXT: v_lshr_b64 v[10:11], v[10:11], 16 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v44 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v24 ; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 ; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 ; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v22 -; SI-NEXT: v_alignbit_b32 v11, v12, v11, 16 -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v23 +; SI-NEXT: v_lshr_b64 v[11:12], v[11:12], 16 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v43 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v22 ; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 ; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 ; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v20 -; SI-NEXT: v_alignbit_b32 v12, v13, v12, 16 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v21 +; SI-NEXT: v_lshr_b64 v[12:13], v[12:13], 16 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v42 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v20 ; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 ; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 ; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v18 -; SI-NEXT: v_alignbit_b32 v13, v14, v13, 16 -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v19 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; SI-NEXT: v_lshr_b64 v[13:14], v[13:14], 16 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v41 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v18 ; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 ; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 ; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_alignbit_b32 v14, v15, v14, 16 -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v16 -; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v17 -; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_alignbit_b32 v15, v16, v15, 16 +; SI-NEXT: v_lshr_b64 v[16:17], v[16:17], 16 +; SI-NEXT: v_lshr_b64 v[14:15], v[14:15], 16 +; SI-NEXT: v_mov_b32_e32 v15, v16 ; SI-NEXT: .LBB83_3: ; %end +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 
offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB83_4: ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 @@ -49401,11 +49581,11 @@ define inreg <8 x double> @bitcast_v32bf16_to_v8f64_scalar(<32 x bfloat> inreg % ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 ; 4-byte Folded Spill ; VI-NEXT: s_mov_b64 exec, s[4:5] -; VI-NEXT: v_writelane_b32 v19, s30, 0 +; VI-NEXT: v_writelane_b32 v20, s30, 0 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; VI-NEXT: v_writelane_b32 v19, s31, 1 +; VI-NEXT: v_writelane_b32 v20, s31, 1 ; VI-NEXT: v_readfirstlane_b32 s30, v0 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_readfirstlane_b32 s31, v1 @@ -49413,295 +49593,303 @@ define inreg <8 x double> @bitcast_v32bf16_to_v8f64_scalar(<32 x bfloat> inreg % ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_cbranch_execnz .LBB83_4 ; VI-NEXT: .LBB83_2: ; %cmp.true -; VI-NEXT: s_lshl_b32 s4, s31, 16 -; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s31, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_mov_b32_e32 v16, 0x40c00000 ; VI-NEXT: s_lshl_b32 s4, s30, 16 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_add_f32_e32 v3, s4, v0 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_add_f32_e32 v0, s4, v16 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; VI-NEXT: 
s_and_b32 s4, s30, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: v_add_f32_e32 v4, s4, v0 -; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; VI-NEXT: v_alignbit_b32 v15, v2, v1, 16 -; VI-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-NEXT: s_lshl_b32 s4, s29, 16 -; VI-NEXT: v_alignbit_b32 v14, v1, v3, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; VI-NEXT: v_add_f32_e32 v1, s4, v16 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s29, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; VI-NEXT: s_lshl_b32 s4, s28, 16 -; VI-NEXT: v_alignbit_b32 v13, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s28, 0xffff0000 ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_add_f32_e32 v2, s4, v16 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: s_and_b32 s6, s28, 0xffff0000 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s27, 16 -; VI-NEXT: v_alignbit_b32 v12, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s27, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; VI-NEXT: v_add_f32_e32 v2, s6, v16 +; VI-NEXT: v_bfe_u32 v5, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v2 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc +; VI-NEXT: v_lshrrev_b64 v[14:15], 16, v[0:1] +; VI-NEXT: v_cndmask_b32_e64 v0, v3, v4, s[4:5] +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; VI-NEXT: s_and_b32 s5, s26, 0xffff0000 +; VI-NEXT: v_lshrrev_b64 v[12:13], 16, v[0:1] +; VI-NEXT: v_add_f32_e32 v0, s5, v16 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 ; VI-NEXT: 
s_lshl_b32 s4, s26, 16 -; VI-NEXT: v_alignbit_b32 v11, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s26, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s25, 16 -; VI-NEXT: v_alignbit_b32 v10, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_add_f32_e32 v0, s4, v16 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s25, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: s_and_b32 s5, s24, 0xffff0000 +; VI-NEXT: v_lshrrev_b64 v[10:11], 16, v[0:1] +; VI-NEXT: v_add_f32_e32 v0, s5, v16 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 ; VI-NEXT: s_lshl_b32 s4, s24, 16 -; VI-NEXT: v_alignbit_b32 v9, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s24, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s23, 16 -; VI-NEXT: v_alignbit_b32 v8, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_add_f32_e32 v0, s4, v16 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, 
v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: s_and_b32 s5, s22, 0xffff0000 +; VI-NEXT: v_lshrrev_b64 v[8:9], 16, v[0:1] +; VI-NEXT: v_add_f32_e32 v0, s5, v16 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 ; VI-NEXT: s_lshl_b32 s4, s22, 16 -; VI-NEXT: v_alignbit_b32 v7, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s22, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s21, 16 -; VI-NEXT: v_alignbit_b32 v6, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_add_f32_e32 v0, s4, v16 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: s_and_b32 s5, s20, 0xffff0000 +; VI-NEXT: v_lshrrev_b64 v[6:7], 16, v[0:1] +; VI-NEXT: v_add_f32_e32 v0, s5, v16 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 ; VI-NEXT: s_lshl_b32 s4, s20, 16 -; VI-NEXT: v_alignbit_b32 v5, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: 
v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s19, 16 -; VI-NEXT: v_alignbit_b32 v4, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_add_f32_e32 v0, s4, v16 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v16, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v16, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: s_and_b32 s5, s18, 0xffff0000 +; VI-NEXT: v_lshrrev_b64 v[4:5], 16, v[0:1] +; VI-NEXT: v_add_f32_e32 v0, s5, v16 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 ; VI-NEXT: s_lshl_b32 s4, s18, 16 -; VI-NEXT: v_alignbit_b32 v3, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_add_f32_e32 v0, s4, v16 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v16, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v16, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v16, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v2 -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16 -; VI-NEXT: v_or_b32_e32 v17, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v16, v17, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s17, 16 -; VI-NEXT: v_alignbit_b32 v2, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v16, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v1 -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16 -; VI-NEXT: v_or_b32_e32 v17, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v16, v17, vcc -; VI-NEXT: v_add_f32_e32 v16, s4, v0 -; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 -; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 -; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 -; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 -; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc -; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; VI-NEXT: s_lshl_b32 s4, s16, 16 -; VI-NEXT: v_alignbit_b32 v1, v16, v1, 16 -; VI-NEXT: v_add_f32_e32 v16, s4, v0 -; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 -; 
VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 +; VI-NEXT: v_lshrrev_b64 v[2:3], 16, v[0:1] +; VI-NEXT: v_add_f32_e32 v0, s4, v16 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: s_and_b32 s6, s16, 0xffff0000 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; VI-NEXT: v_add_f32_e32 v0, s6, v16 +; VI-NEXT: v_bfe_u32 v5, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc +; VI-NEXT: s_lshl_b32 s6, s17, 16 +; VI-NEXT: v_cndmask_b32_e64 v0, v1, v3, s[4:5] +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v5 +; VI-NEXT: v_add_f32_e32 v3, s6, v16 +; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] +; VI-NEXT: v_bfe_u32 v1, v3, 16, 1 +; VI-NEXT: s_and_b32 s6, s17, 0xffff0000 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; VI-NEXT: v_add_f32_e32 v3, s6, v16 +; VI-NEXT: v_bfe_u32 v7, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v3 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc +; VI-NEXT: s_lshl_b32 s6, s19, 16 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v3 +; VI-NEXT: v_add_f32_e32 v3, s6, v16 +; VI-NEXT: v_cndmask_b32_e64 v17, v1, v5, s[4:5] +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: s_and_b32 s6, s19, 0xffff0000 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; VI-NEXT: v_add_f32_e32 v3, s6, v16 +; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18] +; VI-NEXT: v_bfe_u32 v9, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v3 +; VI-NEXT: s_lshl_b32 s6, s21, 16 +; VI-NEXT: v_mov_b32_e32 v1, v17 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; VI-NEXT: v_cndmask_b32_e64 v17, v5, v7, s[4:5] +; VI-NEXT: v_add_f32_e32 v5, s6, v16 +; VI-NEXT: v_or_b32_e32 v11, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_bfe_u32 v7, v5, 16, 1 +; VI-NEXT: s_and_b32 s6, s21, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v3, v9, v11, vcc +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v5 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 +; VI-NEXT: v_add_f32_e32 v5, s6, v16 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v3 +; VI-NEXT: v_bfe_u32 v11, v5, 16, 1 +; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18] +; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v5 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 +; VI-NEXT: s_lshl_b32 s6, s23, 16 +; VI-NEXT: v_mov_b32_e32 v3, v17 +; VI-NEXT: v_or_b32_e32 v13, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: v_cndmask_b32_e64 v17, v7, v9, s[4:5] +; VI-NEXT: v_add_f32_e32 v7, s6, v16 +; VI-NEXT: v_cndmask_b32_e32 v5, v11, v13, vcc +; VI-NEXT: v_bfe_u32 v9, v7, 16, 1 +; VI-NEXT: s_and_b32 s6, s23, 0xffff0000 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v5 +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v7 +; VI-NEXT: v_or_b32_e32 v11, 0x400000, v7 +; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v7, v7 +; VI-NEXT: v_add_f32_e32 v7, s6, v16 +; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18] +; VI-NEXT: v_bfe_u32 v13, v7, 16, 1 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; 
VI-NEXT: v_add_u32_e32 v13, vcc, v13, v7 +; VI-NEXT: s_lshl_b32 s6, s25, 16 +; VI-NEXT: v_mov_b32_e32 v5, v17 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 +; VI-NEXT: v_cndmask_b32_e64 v17, v9, v11, s[4:5] +; VI-NEXT: v_add_f32_e32 v9, s6, v16 +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v7 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; VI-NEXT: v_bfe_u32 v11, v9, 16, 1 +; VI-NEXT: s_and_b32 s6, s25, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v7, v13, v15, vcc +; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v9 +; VI-NEXT: v_or_b32_e32 v13, 0x400000, v9 +; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v9, v9 +; VI-NEXT: v_add_f32_e32 v9, s6, v16 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v7 +; VI-NEXT: v_bfe_u32 v15, v9, 16, 1 +; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18] +; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v9 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 +; VI-NEXT: v_mov_b32_e32 v7, v17 +; VI-NEXT: v_or_b32_e32 v17, 0x400000, v9 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; VI-NEXT: v_cndmask_b32_e32 v9, v15, v17, vcc +; VI-NEXT: s_lshl_b32 s6, s27, 16 +; VI-NEXT: v_cndmask_b32_e64 v17, v11, v13, s[4:5] +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v9 +; VI-NEXT: v_add_f32_e32 v11, s6, v16 +; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18] +; VI-NEXT: v_bfe_u32 v13, v11, 16, 1 +; VI-NEXT: s_and_b32 s6, s27, 0xffff0000 +; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v11 +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v11 +; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v11, v11 +; VI-NEXT: v_add_f32_e32 v11, s6, v16 +; VI-NEXT: v_mov_b32_e32 v9, v17 +; VI-NEXT: v_bfe_u32 v17, v11, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v11 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 ; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 -; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 -; VI-NEXT: v_add_f32_e32 v0, s4, v0 -; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc -; VI-NEXT: v_bfe_u32 v17, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v0 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v11 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; VI-NEXT: s_and_b32 s7, s31, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v11, v17, v18, vcc +; VI-NEXT: v_cndmask_b32_e64 v17, v13, v15, s[4:5] +; VI-NEXT: v_add_f32_e32 v13, s7, v16 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v11 +; VI-NEXT: v_bfe_u32 v15, v13, 16, 1 +; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18] +; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v13 +; VI-NEXT: s_lshl_b32 s6, s31, 16 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 +; VI-NEXT: v_mov_b32_e32 v11, v17 +; VI-NEXT: v_or_b32_e32 v17, 0x400000, v13 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; VI-NEXT: v_add_f32_e32 v13, s6, v16 +; VI-NEXT: v_cndmask_b32_e32 v15, v15, v17, vcc +; VI-NEXT: v_bfe_u32 v17, v13, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v13 ; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 -; VI-NEXT: v_or_b32_e32 v18, 0x400000, v0 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v17, v18, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v0, v16, 16 +; VI-NEXT: s_and_b32 s4, s29, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v13 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; VI-NEXT: v_add_f32_e32 v13, s4, v16 +; VI-NEXT: v_cndmask_b32_e32 v17, v17, v18, vcc +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v15 +; VI-NEXT: v_bfe_u32 v15, v13, 16, 1 +; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v13 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 +; VI-NEXT: v_or_b32_e32 v19, 0x400000, v13 +; 
VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; VI-NEXT: s_lshl_b32 s4, s29, 16 +; VI-NEXT: v_cndmask_b32_e32 v13, v15, v19, vcc +; VI-NEXT: v_add_f32_e32 v15, s4, v16 +; VI-NEXT: v_bfe_u32 v16, v15, 16, 1 +; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v15 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16 +; VI-NEXT: v_or_b32_e32 v19, 0x400000, v15 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; VI-NEXT: v_cndmask_b32_e32 v15, v16, v19, vcc +; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v13 +; VI-NEXT: v_lshrrev_b64 v[15:16], 16, v[15:16] +; VI-NEXT: v_lshrrev_b64 v[16:17], 16, v[17:18] +; VI-NEXT: v_mov_b32_e32 v13, v15 +; VI-NEXT: v_mov_b32_e32 v15, v16 ; VI-NEXT: s_branch .LBB83_5 ; VI-NEXT: .LBB83_3: ; VI-NEXT: s_branch .LBB83_2 @@ -49723,10 +49911,10 @@ define inreg <8 x double> @bitcast_v32bf16_to_v8f64_scalar(<32 x bfloat> inreg % ; VI-NEXT: v_mov_b32_e32 v14, s30 ; VI-NEXT: v_mov_b32_e32 v15, s31 ; VI-NEXT: .LBB83_5: ; %end -; VI-NEXT: v_readlane_b32 s31, v19, 1 -; VI-NEXT: v_readlane_b32 s30, v19, 0 +; VI-NEXT: v_readlane_b32 s31, v20, 1 +; VI-NEXT: v_readlane_b32 s30, v20, 0 ; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 ; 4-byte Folded Reload ; VI-NEXT: s_mov_b64 exec, s[4:5] ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] @@ -61974,185 +62162,193 @@ define inreg <32 x i16> @bitcast_v32bf16_to_v32i16_scalar(<32 x bfloat> inreg %a ; SI-NEXT: s_waitcnt expcnt(6) ; SI-NEXT: v_mul_f32_e64 v57, 1.0, s16 ; SI-NEXT: v_mul_f32_e64 v56, 1.0, s17 -; SI-NEXT: v_mul_f32_e32 v49, 1.0, v0 -; SI-NEXT: v_mul_f32_e32 v39, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v39, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v38, 1.0, v1 ; SI-NEXT: v_mul_f32_e32 v47, 1.0, v2 ; SI-NEXT: v_mul_f32_e32 v46, 1.0, v3 -; SI-NEXT: v_mul_f32_e32 v53, 1.0, v4 -; SI-NEXT: v_mul_f32_e32 v52, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v18, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v49, 1.0, v5 ; SI-NEXT: v_mul_f32_e32 v45, 1.0, v6 ; SI-NEXT: v_mul_f32_e32 v44, 1.0, v7 -; SI-NEXT: v_mul_f32_e32 v55, 1.0, v8 -; SI-NEXT: v_mul_f32_e32 v21, 1.0, v9 +; SI-NEXT: v_mul_f32_e32 v22, 1.0, v8 +; SI-NEXT: v_mul_f32_e32 v51, 1.0, v9 ; SI-NEXT: v_mul_f32_e32 v43, 1.0, v10 ; SI-NEXT: v_mul_f32_e32 v42, 1.0, v11 -; SI-NEXT: v_mul_f32_e32 v25, 1.0, v12 -; SI-NEXT: v_mul_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_mul_f32_e32 v26, 1.0, v12 +; SI-NEXT: v_mul_f32_e32 v53, 1.0, v13 ; SI-NEXT: v_mul_f32_e32 v41, 1.0, v14 -; SI-NEXT: v_mul_f32_e32 v40, 1.0, v15 -; SI-NEXT: v_mul_f32_e32 v29, 1.0, v16 -; SI-NEXT: v_mul_f32_e32 v17, 1.0, v17 -; SI-NEXT: v_mul_f32_e64 v32, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s19 +; SI-NEXT: v_mul_f32_e32 v55, 1.0, v15 +; SI-NEXT: v_mul_f32_e32 v30, 1.0, v16 +; SI-NEXT: v_mul_f32_e32 v14, 1.0, v17 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v34, 1.0, s19 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v63, 1.0, s20 ; SI-NEXT: v_mul_f32_e64 v62, 1.0, s21 -; SI-NEXT: v_mul_f32_e64 v51, 1.0, s22 -; SI-NEXT: v_mul_f32_e64 v5, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v6, 1.0, s22 +; SI-NEXT: v_mul_f32_e64 v36, 1.0, s23 ; SI-NEXT: v_mul_f32_e64 v61, 1.0, s24 ; SI-NEXT: v_mul_f32_e64 v60, 1.0, s25 -; SI-NEXT: v_mul_f32_e64 v54, 1.0, s26 -; SI-NEXT: v_mul_f32_e64 v9, 1.0, s27 +; SI-NEXT: v_mul_f32_e64 v10, 1.0, s26 +; SI-NEXT: v_mul_f32_e64 v40, 1.0, s27 ; SI-NEXT: v_mul_f32_e64 v59, 1.0, s28 ; SI-NEXT: v_mul_f32_e64 v58, 1.0, s29 ; SI-NEXT: s_cbranch_scc0 .LBB95_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: 
v_lshrrev_b32_e32 v0, 16, v57 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v56 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v32 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v56 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v34 ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v63 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v62 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v51 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v62 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v36 ; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v61 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v60 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v54 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v60 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v40 ; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v59 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v39 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v58 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v39 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v38 ; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v47 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v52 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v46 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v49 ; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v45 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v44 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v51 ; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v43 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v42 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v53 ; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v41 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v58 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v49 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v46 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v53 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v44 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v55 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v42 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v25 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v40 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v55 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v14 ; SI-NEXT: s_cbranch_execnz .LBB95_3 ; SI-NEXT: .LBB95_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v56 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v56 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v57 -; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v56, 0x40c00000, v1 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v28 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v62 -; SI-NEXT: v_alignbit_b32 v0, v2, v0, 16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v63 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v56 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v63 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v62 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; SI-NEXT: v_lshr_b64 v[4:5], v[3:4], 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v61 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v60 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v60 
-; SI-NEXT: v_alignbit_b32 v4, v4, v2, 16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v61 -; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; SI-NEXT: v_alignbit_b32 v8, v7, v2, 16 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v58 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v59 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v59 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v3 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v58 +; SI-NEXT: v_lshr_b64 v[8:9], v[7:8], 16 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v5 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v45 +; SI-NEXT: v_lshr_b64 v[12:13], v[11:12], 16 +; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v44 +; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v41 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v47 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v9 +; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v55 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v46 +; SI-NEXT: v_lshr_b64 v[20:21], v[19:20], 16 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v7 -; SI-NEXT: v_alignbit_b32 v12, v10, v2, 16 -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v46 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v47 -; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 -; SI-NEXT: v_alignbit_b32 v16, v11, v2, 16 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v44 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v45 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v43 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v13 +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v18 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v7 +; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v42 +; SI-NEXT: v_lshr_b64 v[28:29], v[27:28], 16 +; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v49 +; SI-NEXT: v_lshr_b64 v[16:17], v[15:16], 16 ; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v11 -; SI-NEXT: v_alignbit_b32 v20, v14, v2, 16 -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v42 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v43 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v26 +; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v11 ; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 -; SI-NEXT: v_alignbit_b32 v24, v15, v2, 16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v41 -; SI-NEXT: v_add_f32_e32 v41, 0x40c00000, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v40 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v2 -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v29 -; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v25 +; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v53 +; SI-NEXT: 
v_lshrrev_b32_e32 v19, 16, v9 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v39 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 +; SI-NEXT: v_lshr_b64 v[24:25], v[23:24], 16 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v14 ; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v38 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v36 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v30 ; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v13 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v21 -; SI-NEXT: v_alignbit_b32 v26, v27, v2, 16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v55 -; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v13 -; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v11 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v52 -; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; SI-NEXT: v_alignbit_b32 v22, v23, v2, 16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v53 -; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v17 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v11 -; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v10 -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v39 -; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; SI-NEXT: v_alignbit_b32 v18, v19, v2, 16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v49 -; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; SI-NEXT: v_alignbit_b32 v30, v31, v15, 16 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v10 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v7 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v9 -; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v14 -; SI-NEXT: v_alignbit_b32 v14, v15, v2, 16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v54 +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v22 ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v7 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; SI-NEXT: v_alignbit_b32 v10, v11, v2, 16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v51 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v10 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v15 +; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v51 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v7 +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v40 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v34 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshr_b64 v[32:33], v[30:31], 16 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v5 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v5 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v6 -; SI-NEXT: v_alignbit_b32 v6, v7, v2, 16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v32 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 
-; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v28 -; SI-NEXT: v_lshr_b64 v[33:34], v[1:2], 16 -; SI-NEXT: v_lshr_b64 v[34:35], v[5:6], 16 -; SI-NEXT: v_lshr_b64 v[35:36], v[9:10], 16 -; SI-NEXT: v_lshr_b64 v[36:37], v[13:14], 16 -; SI-NEXT: v_lshr_b64 v[37:38], v[17:18], 16 -; SI-NEXT: v_lshr_b64 v[38:39], v[21:22], 16 -; SI-NEXT: v_lshr_b64 v[50:51], v[25:26], 16 -; SI-NEXT: v_lshr_b64 v[48:49], v[29:30], 16 -; SI-NEXT: v_alignbit_b32 v28, v40, v41, 16 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshr_b64 v[54:55], v[26:27], 16 +; SI-NEXT: v_lshr_b64 v[52:53], v[22:23], 16 +; SI-NEXT: v_lshr_b64 v[50:51], v[18:19], 16 +; SI-NEXT: v_lshr_b64 v[48:49], v[14:15], 16 +; SI-NEXT: v_lshr_b64 v[37:38], v[10:11], 16 +; SI-NEXT: v_lshr_b64 v[35:36], v[6:7], 16 +; SI-NEXT: v_lshr_b64 v[33:34], v[2:3], 16 +; SI-NEXT: v_mov_b32_e32 v30, v32 +; SI-NEXT: v_mov_b32_e32 v26, v54 +; SI-NEXT: v_mov_b32_e32 v22, v52 +; SI-NEXT: v_mov_b32_e32 v18, v50 +; SI-NEXT: v_mov_b32_e32 v14, v48 +; SI-NEXT: v_mov_b32_e32 v10, v37 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; SI-NEXT: v_mov_b32_e32 v6, v35 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v56 +; SI-NEXT: v_mov_b32_e32 v2, v33 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: v_lshr_b64 v[5:6], v[5:6], 16 +; SI-NEXT: v_lshr_b64 v[9:10], v[9:10], 16 +; SI-NEXT: v_lshr_b64 v[13:14], v[13:14], 16 +; SI-NEXT: v_lshr_b64 v[17:18], v[17:18], 16 +; SI-NEXT: v_lshr_b64 v[21:22], v[21:22], 16 +; SI-NEXT: v_lshr_b64 v[25:26], v[25:26], 16 +; SI-NEXT: v_lshr_b64 v[29:30], v[29:30], 16 ; SI-NEXT: .LBB95_3: ; %end ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -62170,49 +62366,49 @@ define inreg <32 x i16> @bitcast_v32bf16_to_v32i16_scalar(<32 x bfloat> inreg %a ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v1, v33 -; SI-NEXT: v_mov_b32_e32 v5, v34 -; SI-NEXT: v_mov_b32_e32 v9, v35 -; SI-NEXT: v_mov_b32_e32 v13, v36 -; SI-NEXT: v_mov_b32_e32 v17, v37 -; SI-NEXT: v_mov_b32_e32 v21, v38 -; SI-NEXT: v_mov_b32_e32 v25, v50 -; SI-NEXT: v_mov_b32_e32 v29, v48 +; SI-NEXT: v_mov_b32_e32 v2, v33 +; SI-NEXT: v_mov_b32_e32 v6, v35 +; SI-NEXT: v_mov_b32_e32 v10, v37 +; SI-NEXT: v_mov_b32_e32 v14, v48 +; SI-NEXT: v_mov_b32_e32 v18, v50 +; SI-NEXT: v_mov_b32_e32 v22, v52 +; SI-NEXT: v_mov_b32_e32 v26, v54 +; SI-NEXT: v_mov_b32_e32 v30, v32 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB95_4: ; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr48 ; 
SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: s_branch .LBB95_2 ; ; VI-LABEL: bitcast_v32bf16_to_v32i16_scalar: @@ -62231,295 +62427,302 @@ define inreg <32 x i16> @bitcast_v32bf16_to_v32i16_scalar(<32 x bfloat> inreg %a ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_cbranch_execnz .LBB95_4 ; VI-NEXT: .LBB95_2: ; %cmp.true -; VI-NEXT: s_lshl_b32 s4, s16, 16 -; VI-NEXT: v_mov_b32_e32 v1, 0x40c00000 -; VI-NEXT: v_add_f32_e32 v0, s4, v1 -; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: s_lshl_b32 s5, s30, 16 -; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s5, v1 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: s_and_b32 s5, s30, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_add_f32_e32 v3, s5, v1 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: s_lshl_b32 s5, s31, 16 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: v_add_f32_e32 v4, s5, v1 -; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; VI-NEXT: s_and_b32 s5, s31, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; VI-NEXT: v_add_f32_e32 v5, s5, v1 -; VI-NEXT: v_bfe_u32 v6, v5, 16, 1 -; VI-NEXT: s_lshl_b32 s4, s29, 16 -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v5 -; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; VI-NEXT: v_alignbit_b32 v14, v3, v2, 16 -; VI-NEXT: v_add_f32_e32 v2, s4, v1 -; VI-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_alignbit_b32 v15, v5, v4, 16 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: s_and_b32 s4, s29, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_add_f32_e32 v3, s4, v1 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: 
v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; VI-NEXT: v_mov_b32_e32 v16, 0x40c00000 +; VI-NEXT: s_lshl_b32 s4, s26, 16 +; VI-NEXT: v_add_f32_e32 v4, s4, v16 ; VI-NEXT: s_lshl_b32 s4, s28, 16 -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_alignbit_b32 v13, v3, v2, 16 -; VI-NEXT: v_add_f32_e32 v2, s4, v1 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_add_f32_e32 v0, s4, v16 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; VI-NEXT: s_and_b32 s4, s28, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_add_f32_e32 v3, s4, v1 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: s_lshl_b32 s4, s27, 16 -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_alignbit_b32 v12, v3, v2, 16 -; VI-NEXT: v_add_f32_e32 v2, s4, v1 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; VI-NEXT: v_add_f32_e32 v1, s4, v16 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_lshl_b32 s4, s30, 16 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v16 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: s_and_b32 s4, s27, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_add_f32_e32 v3, s4, v1 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; VI-NEXT: s_and_b32 s4, s30, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v5, vcc +; VI-NEXT: v_add_f32_e32 v3, s4, v16 +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v3 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: s_lshl_b32 s4, s26, 16 +; VI-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc ; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_alignbit_b32 v11, v3, v2, 16 -; VI-NEXT: v_add_f32_e32 v2, s4, v1 +; VI-NEXT: s_and_b32 s6, s26, 0xffff0000 +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_lshrrev_b64 v[14:15], 16, v[2:3] +; VI-NEXT: v_add_f32_e32 v2, s6, v16 +; VI-NEXT: v_lshrrev_b64 v[12:13], 16, v[0:1] +; VI-NEXT: v_bfe_u32 v0, v4, 16, 1 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v4 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7fff, v0 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v1, 0x400000, v4 +; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 ; VI-NEXT: v_or_b32_e32 v4, 
0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: s_and_b32 s4, s26, 0xffff0000 ; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_add_f32_e32 v3, s4, v1 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: s_lshl_b32 s4, s25, 16 -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_alignbit_b32 v10, v3, v2, 16 -; VI-NEXT: v_add_f32_e32 v2, s4, v1 +; VI-NEXT: s_lshl_b32 s6, s24, 16 +; VI-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5] +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; VI-NEXT: v_add_f32_e32 v2, s6, v16 +; VI-NEXT: v_lshrrev_b64 v[10:11], 16, v[0:1] +; VI-NEXT: v_bfe_u32 v0, v2, 16, 1 +; VI-NEXT: s_and_b32 s6, s24, 0xffff0000 +; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; VI-NEXT: v_or_b32_e32 v1, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; VI-NEXT: v_add_f32_e32 v2, s6, v16 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7fff, v0 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: s_and_b32 s4, s25, 0xffff0000 ; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_add_f32_e32 v3, s4, v1 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: s_lshl_b32 s4, s24, 16 -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_alignbit_b32 v9, v3, v2, 16 -; VI-NEXT: v_add_f32_e32 v2, s4, v1 +; VI-NEXT: s_lshl_b32 s6, s22, 16 +; VI-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5] +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; VI-NEXT: v_add_f32_e32 v2, s6, v16 +; VI-NEXT: v_lshrrev_b64 v[8:9], 16, v[0:1] +; VI-NEXT: v_bfe_u32 v0, v2, 16, 1 +; VI-NEXT: s_and_b32 s6, s22, 0xffff0000 +; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; VI-NEXT: v_or_b32_e32 v1, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; VI-NEXT: v_add_f32_e32 v2, s6, v16 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7fff, v0 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: s_and_b32 s4, s24, 0xffff0000 ; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_add_f32_e32 v3, s4, v1 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: s_lshl_b32 s4, s23, 16 -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_alignbit_b32 v8, v3, v2, 16 -; VI-NEXT: v_add_f32_e32 v2, s4, v1 +; VI-NEXT: s_lshl_b32 s6, s20, 16 +; VI-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5] +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; VI-NEXT: v_add_f32_e32 v2, s6, v16 +; VI-NEXT: v_lshrrev_b64 v[6:7], 16, v[0:1] +; VI-NEXT: v_bfe_u32 v0, v2, 16, 1 +; VI-NEXT: s_and_b32 s6, s20, 0xffff0000 +; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; VI-NEXT: v_or_b32_e32 v1, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; VI-NEXT: v_add_f32_e32 v2, s6, v16 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 ; VI-NEXT: v_add_u32_e32 
v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7fff, v0 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 ; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_add_f32_e32 v3, s4, v1 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: s_lshl_b32 s4, s22, 16 -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_alignbit_b32 v7, v3, v2, 16 -; VI-NEXT: v_add_f32_e32 v2, s4, v1 +; VI-NEXT: s_lshl_b32 s6, s18, 16 +; VI-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5] +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; VI-NEXT: v_add_f32_e32 v2, s6, v16 +; VI-NEXT: v_lshrrev_b64 v[4:5], 16, v[0:1] +; VI-NEXT: v_bfe_u32 v0, v2, 16, 1 +; VI-NEXT: s_and_b32 s6, s18, 0xffff0000 +; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; VI-NEXT: v_or_b32_e32 v1, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; VI-NEXT: v_add_f32_e32 v2, s6, v16 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7fff, v0 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: s_and_b32 s4, s22, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_add_f32_e32 v3, s4, v1 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v5, vcc +; VI-NEXT: s_and_b32 s7, s16, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5] +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; VI-NEXT: v_add_f32_e32 v5, s7, v16 +; VI-NEXT: v_lshrrev_b64 v[2:3], 16, v[0:1] +; VI-NEXT: v_bfe_u32 v0, v5, 16, 1 +; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v5 +; VI-NEXT: s_lshl_b32 s6, s16, 16 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7fff, v0 +; VI-NEXT: v_or_b32_e32 v1, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: v_add_f32_e32 v3, s6, v16 +; VI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc +; VI-NEXT: v_bfe_u32 v0, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v3 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7fff, v0 ; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: s_lshl_b32 s4, s21, 16 -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_alignbit_b32 v6, v3, v2, 16 -; VI-NEXT: v_add_f32_e32 v2, s4, v1 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_add_f32_e32 v3, s4, v1 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: s_and_b32 s5, s17, 0xffff0000 +; VI-NEXT: v_add_f32_e32 v3, s5, v16 +; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] +; VI-NEXT: v_bfe_u32 v1, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v3 +; VI-NEXT: s_lshl_b32 s4, s17, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 ; VI-NEXT: v_or_b32_e32 v5, 
0x400000, v3 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: s_lshl_b32 s4, s20, 16 -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_alignbit_b32 v5, v3, v2, 16 -; VI-NEXT: v_add_f32_e32 v2, s4, v1 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_add_f32_e32 v3, s4, v1 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v16, 0x400000, v3 +; VI-NEXT: v_add_f32_e32 v3, s4, v16 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: s_and_b32 s5, s19, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v16, vcc +; VI-NEXT: v_add_f32_e32 v3, s5, v16 +; VI-NEXT: v_cndmask_b32_e32 v17, v5, v7, vcc +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 ; VI-NEXT: s_lshl_b32 s4, s19, 16 -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_alignbit_b32 v4, v3, v2, 16 -; VI-NEXT: v_add_f32_e32 v2, s4, v1 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v16, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v16, vcc -; VI-NEXT: v_add_f32_e32 v3, s4, v1 -; VI-NEXT: v_bfe_u32 v16, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v3 -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16 -; VI-NEXT: v_or_b32_e32 v17, 0x400000, v3 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_cndmask_b32_e32 v3, v16, v17, vcc -; VI-NEXT: s_lshl_b32 s4, s18, 16 -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_alignbit_b32 v3, v3, v2, 16 -; VI-NEXT: v_add_f32_e32 v2, s4, v1 -; VI-NEXT: v_bfe_u32 v16, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v2 -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16 -; VI-NEXT: v_or_b32_e32 v17, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v2, v16, v17, vcc -; VI-NEXT: v_add_f32_e32 v16, s4, v1 -; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 -; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 -; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 -; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 -; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc -; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 -; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; VI-NEXT: v_alignbit_b32 v2, v16, v2, 16 -; VI-NEXT: v_add_f32_e32 v16, s4, v1 -; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 -; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 +; VI-NEXT: v_add_f32_e32 v3, s4, v16 +; VI-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc +; VI-NEXT: v_bfe_u32 v7, v3, 16, 1 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v1 +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v3 +; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18] +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; VI-NEXT: s_and_b32 s5, s21, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, 
v3, v3 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v5 +; VI-NEXT: v_add_f32_e32 v5, s5, v16 +; VI-NEXT: v_mov_b32_e32 v1, v17 +; VI-NEXT: v_cndmask_b32_e32 v17, v7, v9, vcc +; VI-NEXT: v_bfe_u32 v7, v5, 16, 1 +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v5 +; VI-NEXT: s_lshl_b32 s4, s21, 16 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: v_add_f32_e32 v5, s4, v16 +; VI-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc +; VI-NEXT: v_bfe_u32 v9, v5, 16, 1 +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v5 +; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18] +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; VI-NEXT: s_and_b32 s5, s23, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v11, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v7 +; VI-NEXT: v_add_f32_e32 v7, s5, v16 +; VI-NEXT: v_mov_b32_e32 v3, v17 +; VI-NEXT: v_cndmask_b32_e32 v17, v9, v11, vcc +; VI-NEXT: v_bfe_u32 v9, v7, 16, 1 +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v7 +; VI-NEXT: s_lshl_b32 s4, s23, 16 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; VI-NEXT: v_or_b32_e32 v11, 0x400000, v7 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; VI-NEXT: v_add_f32_e32 v7, s4, v16 +; VI-NEXT: v_cndmask_b32_e32 v9, v9, v11, vcc +; VI-NEXT: v_bfe_u32 v11, v7, 16, 1 +; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v7 +; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18] +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 +; VI-NEXT: s_and_b32 s5, s25, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v13, 0x400000, v7 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v9 +; VI-NEXT: v_add_f32_e32 v9, s5, v16 +; VI-NEXT: v_mov_b32_e32 v5, v17 +; VI-NEXT: v_cndmask_b32_e32 v17, v11, v13, vcc +; VI-NEXT: v_bfe_u32 v11, v9, 16, 1 +; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v9 +; VI-NEXT: s_lshl_b32 s4, s25, 16 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 +; VI-NEXT: v_or_b32_e32 v13, 0x400000, v9 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; VI-NEXT: v_add_f32_e32 v9, s4, v16 +; VI-NEXT: v_cndmask_b32_e32 v11, v11, v13, vcc +; VI-NEXT: v_bfe_u32 v13, v9, 16, 1 +; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v9 +; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18] +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 +; VI-NEXT: s_and_b32 s5, s27, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v9 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v11 +; VI-NEXT: v_add_f32_e32 v11, s5, v16 +; VI-NEXT: v_mov_b32_e32 v7, v17 +; VI-NEXT: v_cndmask_b32_e32 v17, v13, v15, vcc +; VI-NEXT: v_bfe_u32 v13, v11, 16, 1 +; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v11 +; VI-NEXT: s_lshl_b32 s4, s27, 16 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v11 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; VI-NEXT: v_add_f32_e32 v11, s4, v16 +; VI-NEXT: v_cndmask_b32_e32 v13, v13, v15, vcc +; VI-NEXT: v_bfe_u32 v15, v11, 16, 1 +; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18] +; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v11 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 +; VI-NEXT: s_and_b32 s5, s29, 0xffff0000 +; VI-NEXT: v_mov_b32_e32 v9, v17 +; VI-NEXT: v_or_b32_e32 v17, 0x400000, v11 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v13 +; VI-NEXT: v_add_f32_e32 v13, s5, v16 +; VI-NEXT: v_cndmask_b32_e32 v17, v15, v17, vcc +; VI-NEXT: v_bfe_u32 v15, v13, 16, 1 +; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18] +; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v13 +; VI-NEXT: s_lshl_b32 s4, s29, 16 +; VI-NEXT: v_add_u32_e32 
v15, vcc, 0x7fff, v15 +; VI-NEXT: v_mov_b32_e32 v11, v17 +; VI-NEXT: v_or_b32_e32 v17, 0x400000, v13 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; VI-NEXT: v_add_f32_e32 v13, s4, v16 +; VI-NEXT: v_cndmask_b32_e32 v15, v15, v17, vcc +; VI-NEXT: v_bfe_u32 v17, v13, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v13 ; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 -; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 -; VI-NEXT: s_lshl_b32 s4, s17, 16 -; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc -; VI-NEXT: v_add_f32_e32 v17, s4, v1 -; VI-NEXT: v_bfe_u32 v18, v17, 16, 1 -; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v17 -; VI-NEXT: v_add_u32_e32 v18, vcc, 0x7fff, v18 -; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v19, 0x400000, v17 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 -; VI-NEXT: v_add_f32_e32 v1, s4, v1 -; VI-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc -; VI-NEXT: v_bfe_u32 v18, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v1 -; VI-NEXT: v_add_u32_e32 v18, vcc, 0x7fff, v18 -; VI-NEXT: v_or_b32_e32 v19, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: v_cndmask_b32_e32 v1, v18, v19, vcc -; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-NEXT: v_alignbit_b32 v1, v1, v17, 16 -; VI-NEXT: v_alignbit_b32 v0, v16, v0, 16 +; VI-NEXT: s_and_b32 s4, s31, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v13 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; VI-NEXT: v_add_f32_e32 v13, s4, v16 +; VI-NEXT: v_cndmask_b32_e32 v17, v17, v18, vcc +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v15 +; VI-NEXT: v_bfe_u32 v15, v13, 16, 1 +; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v13 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 +; VI-NEXT: v_or_b32_e32 v19, 0x400000, v13 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; VI-NEXT: s_lshl_b32 s4, s31, 16 +; VI-NEXT: v_cndmask_b32_e32 v13, v15, v19, vcc +; VI-NEXT: v_add_f32_e32 v15, s4, v16 +; VI-NEXT: v_bfe_u32 v16, v15, 16, 1 +; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v15 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16 +; VI-NEXT: v_or_b32_e32 v19, 0x400000, v15 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; VI-NEXT: v_cndmask_b32_e32 v15, v16, v19, vcc +; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v13 +; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18] +; VI-NEXT: v_lshrrev_b64 v[15:16], 16, v[15:16] +; VI-NEXT: v_mov_b32_e32 v13, v17 ; VI-NEXT: s_branch .LBB95_5 ; VI-NEXT: .LBB95_3: ; VI-NEXT: s_branch .LBB95_2 @@ -74865,295 +75068,302 @@ define inreg <32 x half> @bitcast_v32bf16_to_v32f16_scalar(<32 x bfloat> inreg % ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_cbranch_execnz .LBB103_4 ; VI-NEXT: .LBB103_2: ; %cmp.true -; VI-NEXT: s_lshl_b32 s4, s16, 16 -; VI-NEXT: v_mov_b32_e32 v1, 0x40c00000 -; VI-NEXT: v_add_f32_e32 v0, s4, v1 -; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: s_lshl_b32 s5, s30, 16 -; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s5, v1 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: s_and_b32 s5, s30, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_add_f32_e32 v3, s5, v1 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, 
vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: s_lshl_b32 s5, s31, 16 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: v_add_f32_e32 v4, s5, v1 -; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; VI-NEXT: s_and_b32 s5, s31, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; VI-NEXT: v_add_f32_e32 v5, s5, v1 -; VI-NEXT: v_bfe_u32 v6, v5, 16, 1 -; VI-NEXT: s_lshl_b32 s4, s29, 16 -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v5 -; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; VI-NEXT: v_alignbit_b32 v14, v3, v2, 16 -; VI-NEXT: v_add_f32_e32 v2, s4, v1 -; VI-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_alignbit_b32 v15, v5, v4, 16 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: s_and_b32 s4, s29, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_add_f32_e32 v3, s4, v1 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; VI-NEXT: v_mov_b32_e32 v16, 0x40c00000 +; VI-NEXT: s_lshl_b32 s4, s26, 16 +; VI-NEXT: v_add_f32_e32 v4, s4, v16 ; VI-NEXT: s_lshl_b32 s4, s28, 16 -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_alignbit_b32 v13, v3, v2, 16 -; VI-NEXT: v_add_f32_e32 v2, s4, v1 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_add_f32_e32 v0, s4, v16 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; VI-NEXT: s_and_b32 s4, s28, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_add_f32_e32 v3, s4, v1 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: s_lshl_b32 s4, s27, 16 -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_alignbit_b32 v12, v3, v2, 16 -; VI-NEXT: v_add_f32_e32 v2, s4, v1 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; VI-NEXT: v_add_f32_e32 v1, s4, v16 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_lshl_b32 s4, s30, 16 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v16 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: s_and_b32 s4, 
s27, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_add_f32_e32 v3, s4, v1 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; VI-NEXT: s_and_b32 s4, s30, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v5, vcc +; VI-NEXT: v_add_f32_e32 v3, s4, v16 +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v3 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: s_lshl_b32 s4, s26, 16 +; VI-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc ; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_alignbit_b32 v11, v3, v2, 16 -; VI-NEXT: v_add_f32_e32 v2, s4, v1 +; VI-NEXT: s_and_b32 s6, s26, 0xffff0000 +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_lshrrev_b64 v[14:15], 16, v[2:3] +; VI-NEXT: v_add_f32_e32 v2, s6, v16 +; VI-NEXT: v_lshrrev_b64 v[12:13], 16, v[0:1] +; VI-NEXT: v_bfe_u32 v0, v4, 16, 1 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v4 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7fff, v0 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v1, 0x400000, v4 +; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: s_and_b32 s4, s26, 0xffff0000 ; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_add_f32_e32 v3, s4, v1 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: s_lshl_b32 s4, s25, 16 -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_alignbit_b32 v10, v3, v2, 16 -; VI-NEXT: v_add_f32_e32 v2, s4, v1 +; VI-NEXT: s_lshl_b32 s6, s24, 16 +; VI-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5] +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; VI-NEXT: v_add_f32_e32 v2, s6, v16 +; VI-NEXT: v_lshrrev_b64 v[10:11], 16, v[0:1] +; VI-NEXT: v_bfe_u32 v0, v2, 16, 1 +; VI-NEXT: s_and_b32 s6, s24, 0xffff0000 +; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; VI-NEXT: v_or_b32_e32 v1, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; VI-NEXT: v_add_f32_e32 v2, s6, v16 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7fff, v0 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: s_and_b32 s4, s25, 0xffff0000 ; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_add_f32_e32 v3, s4, v1 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: s_lshl_b32 s4, s24, 16 -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_alignbit_b32 v9, v3, v2, 16 -; VI-NEXT: v_add_f32_e32 v2, s4, v1 +; VI-NEXT: s_lshl_b32 s6, s22, 16 +; VI-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5] +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; VI-NEXT: v_add_f32_e32 v2, s6, v16 +; VI-NEXT: v_lshrrev_b64 v[8:9], 16, v[0:1] +; VI-NEXT: v_bfe_u32 v0, v2, 16, 1 +; VI-NEXT: s_and_b32 s6, s22, 0xffff0000 +; VI-NEXT: v_add_u32_e32 v0, vcc, 
v0, v2 +; VI-NEXT: v_or_b32_e32 v1, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; VI-NEXT: v_add_f32_e32 v2, s6, v16 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7fff, v0 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: s_and_b32 s4, s24, 0xffff0000 ; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_add_f32_e32 v3, s4, v1 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: s_lshl_b32 s4, s23, 16 -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_alignbit_b32 v8, v3, v2, 16 -; VI-NEXT: v_add_f32_e32 v2, s4, v1 +; VI-NEXT: s_lshl_b32 s6, s20, 16 +; VI-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5] +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; VI-NEXT: v_add_f32_e32 v2, s6, v16 +; VI-NEXT: v_lshrrev_b64 v[6:7], 16, v[0:1] +; VI-NEXT: v_bfe_u32 v0, v2, 16, 1 +; VI-NEXT: s_and_b32 s6, s20, 0xffff0000 +; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; VI-NEXT: v_or_b32_e32 v1, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; VI-NEXT: v_add_f32_e32 v2, s6, v16 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7fff, v0 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 ; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_add_f32_e32 v3, s4, v1 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: s_lshl_b32 s4, s22, 16 -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_alignbit_b32 v7, v3, v2, 16 -; VI-NEXT: v_add_f32_e32 v2, s4, v1 +; VI-NEXT: s_lshl_b32 s6, s18, 16 +; VI-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5] +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; VI-NEXT: v_add_f32_e32 v2, s6, v16 +; VI-NEXT: v_lshrrev_b64 v[4:5], 16, v[0:1] +; VI-NEXT: v_bfe_u32 v0, v2, 16, 1 +; VI-NEXT: s_and_b32 s6, s18, 0xffff0000 +; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; VI-NEXT: v_or_b32_e32 v1, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; VI-NEXT: v_add_f32_e32 v2, s6, v16 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7fff, v0 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: s_and_b32 s4, s22, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_add_f32_e32 v3, s4, v1 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v5, vcc +; VI-NEXT: s_and_b32 s7, s16, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5] +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; VI-NEXT: v_add_f32_e32 v5, s7, v16 +; VI-NEXT: v_lshrrev_b64 v[2:3], 16, v[0:1] +; VI-NEXT: v_bfe_u32 v0, v5, 16, 1 +; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v5 +; VI-NEXT: s_lshl_b32 s6, s16, 16 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7fff, v0 +; 
VI-NEXT: v_or_b32_e32 v1, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: v_add_f32_e32 v3, s6, v16 +; VI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc +; VI-NEXT: v_bfe_u32 v0, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v3 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7fff, v0 ; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: s_lshl_b32 s4, s21, 16 -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_alignbit_b32 v6, v3, v2, 16 -; VI-NEXT: v_add_f32_e32 v2, s4, v1 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_add_f32_e32 v3, s4, v1 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: s_and_b32 s5, s17, 0xffff0000 +; VI-NEXT: v_add_f32_e32 v3, s5, v16 +; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] +; VI-NEXT: v_bfe_u32 v1, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v3 +; VI-NEXT: s_lshl_b32 s4, s17, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 ; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: s_lshl_b32 s4, s20, 16 -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_alignbit_b32 v5, v3, v2, 16 -; VI-NEXT: v_add_f32_e32 v2, s4, v1 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_add_f32_e32 v3, s4, v1 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v16, 0x400000, v3 +; VI-NEXT: v_add_f32_e32 v3, s4, v16 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: s_and_b32 s5, s19, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v16, vcc +; VI-NEXT: v_add_f32_e32 v3, s5, v16 +; VI-NEXT: v_cndmask_b32_e32 v17, v5, v7, vcc +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 ; VI-NEXT: s_lshl_b32 s4, s19, 16 -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_alignbit_b32 v4, v3, v2, 16 -; VI-NEXT: v_add_f32_e32 v2, s4, v1 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v16, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v16, vcc -; VI-NEXT: v_add_f32_e32 v3, s4, v1 -; VI-NEXT: v_bfe_u32 v16, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v3 -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16 -; VI-NEXT: v_or_b32_e32 v17, 0x400000, v3 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_cndmask_b32_e32 v3, v16, v17, vcc -; 
VI-NEXT: s_lshl_b32 s4, s18, 16 -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_alignbit_b32 v3, v3, v2, 16 -; VI-NEXT: v_add_f32_e32 v2, s4, v1 -; VI-NEXT: v_bfe_u32 v16, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v2 -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16 -; VI-NEXT: v_or_b32_e32 v17, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v2, v16, v17, vcc -; VI-NEXT: v_add_f32_e32 v16, s4, v1 -; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 -; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 -; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 -; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 -; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc -; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 -; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; VI-NEXT: v_alignbit_b32 v2, v16, v2, 16 -; VI-NEXT: v_add_f32_e32 v16, s4, v1 -; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 -; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 +; VI-NEXT: v_add_f32_e32 v3, s4, v16 +; VI-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc +; VI-NEXT: v_bfe_u32 v7, v3, 16, 1 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v1 +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v3 +; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18] +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; VI-NEXT: s_and_b32 s5, s21, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v5 +; VI-NEXT: v_add_f32_e32 v5, s5, v16 +; VI-NEXT: v_mov_b32_e32 v1, v17 +; VI-NEXT: v_cndmask_b32_e32 v17, v7, v9, vcc +; VI-NEXT: v_bfe_u32 v7, v5, 16, 1 +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v5 +; VI-NEXT: s_lshl_b32 s4, s21, 16 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: v_add_f32_e32 v5, s4, v16 +; VI-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc +; VI-NEXT: v_bfe_u32 v9, v5, 16, 1 +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v5 +; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18] +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; VI-NEXT: s_and_b32 s5, s23, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v11, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v7 +; VI-NEXT: v_add_f32_e32 v7, s5, v16 +; VI-NEXT: v_mov_b32_e32 v3, v17 +; VI-NEXT: v_cndmask_b32_e32 v17, v9, v11, vcc +; VI-NEXT: v_bfe_u32 v9, v7, 16, 1 +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v7 +; VI-NEXT: s_lshl_b32 s4, s23, 16 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; VI-NEXT: v_or_b32_e32 v11, 0x400000, v7 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; VI-NEXT: v_add_f32_e32 v7, s4, v16 +; VI-NEXT: v_cndmask_b32_e32 v9, v9, v11, vcc +; VI-NEXT: v_bfe_u32 v11, v7, 16, 1 +; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v7 +; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18] +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 +; VI-NEXT: s_and_b32 s5, s25, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v13, 0x400000, v7 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v9 +; VI-NEXT: v_add_f32_e32 v9, s5, v16 +; VI-NEXT: v_mov_b32_e32 v5, v17 +; VI-NEXT: v_cndmask_b32_e32 v17, v11, v13, vcc +; VI-NEXT: v_bfe_u32 v11, v9, 16, 1 +; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v9 +; VI-NEXT: s_lshl_b32 s4, s25, 16 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 +; VI-NEXT: v_or_b32_e32 v13, 0x400000, v9 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; VI-NEXT: v_add_f32_e32 v9, s4, v16 +; VI-NEXT: v_cndmask_b32_e32 v11, v11, v13, vcc +; VI-NEXT: v_bfe_u32 v13, v9, 16, 1 +; VI-NEXT: 
v_add_u32_e32 v13, vcc, v13, v9 +; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18] +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 +; VI-NEXT: s_and_b32 s5, s27, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v9 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v11 +; VI-NEXT: v_add_f32_e32 v11, s5, v16 +; VI-NEXT: v_mov_b32_e32 v7, v17 +; VI-NEXT: v_cndmask_b32_e32 v17, v13, v15, vcc +; VI-NEXT: v_bfe_u32 v13, v11, 16, 1 +; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v11 +; VI-NEXT: s_lshl_b32 s4, s27, 16 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v11 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; VI-NEXT: v_add_f32_e32 v11, s4, v16 +; VI-NEXT: v_cndmask_b32_e32 v13, v13, v15, vcc +; VI-NEXT: v_bfe_u32 v15, v11, 16, 1 +; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18] +; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v11 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 +; VI-NEXT: s_and_b32 s5, s29, 0xffff0000 +; VI-NEXT: v_mov_b32_e32 v9, v17 +; VI-NEXT: v_or_b32_e32 v17, 0x400000, v11 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v13 +; VI-NEXT: v_add_f32_e32 v13, s5, v16 +; VI-NEXT: v_cndmask_b32_e32 v17, v15, v17, vcc +; VI-NEXT: v_bfe_u32 v15, v13, 16, 1 +; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18] +; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v13 +; VI-NEXT: s_lshl_b32 s4, s29, 16 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 +; VI-NEXT: v_mov_b32_e32 v11, v17 +; VI-NEXT: v_or_b32_e32 v17, 0x400000, v13 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; VI-NEXT: v_add_f32_e32 v13, s4, v16 +; VI-NEXT: v_cndmask_b32_e32 v15, v15, v17, vcc +; VI-NEXT: v_bfe_u32 v17, v13, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v13 ; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 -; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 -; VI-NEXT: s_lshl_b32 s4, s17, 16 -; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc -; VI-NEXT: v_add_f32_e32 v17, s4, v1 -; VI-NEXT: v_bfe_u32 v18, v17, 16, 1 -; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v17 -; VI-NEXT: v_add_u32_e32 v18, vcc, 0x7fff, v18 -; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v19, 0x400000, v17 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 -; VI-NEXT: v_add_f32_e32 v1, s4, v1 -; VI-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc -; VI-NEXT: v_bfe_u32 v18, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v1 -; VI-NEXT: v_add_u32_e32 v18, vcc, 0x7fff, v18 -; VI-NEXT: v_or_b32_e32 v19, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: v_cndmask_b32_e32 v1, v18, v19, vcc -; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-NEXT: v_alignbit_b32 v1, v1, v17, 16 -; VI-NEXT: v_alignbit_b32 v0, v16, v0, 16 +; VI-NEXT: s_and_b32 s4, s31, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v13 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; VI-NEXT: v_add_f32_e32 v13, s4, v16 +; VI-NEXT: v_cndmask_b32_e32 v17, v17, v18, vcc +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v15 +; VI-NEXT: v_bfe_u32 v15, v13, 16, 1 +; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v13 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 +; VI-NEXT: v_or_b32_e32 v19, 0x400000, v13 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; VI-NEXT: s_lshl_b32 s4, s31, 16 +; VI-NEXT: v_cndmask_b32_e32 v13, v15, v19, vcc +; VI-NEXT: v_add_f32_e32 v15, s4, v16 +; VI-NEXT: v_bfe_u32 v16, v15, 16, 1 +; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v15 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16 +; VI-NEXT: v_or_b32_e32 v19, 0x400000, v15 +; VI-NEXT: 
v_cmp_u_f32_e32 vcc, v15, v15 +; VI-NEXT: v_cndmask_b32_e32 v15, v16, v19, vcc +; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v13 +; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18] +; VI-NEXT: v_lshrrev_b64 v[15:16], 16, v[15:16] +; VI-NEXT: v_mov_b32_e32 v13, v17 ; VI-NEXT: s_branch .LBB103_5 ; VI-NEXT: .LBB103_3: ; VI-NEXT: s_branch .LBB103_2 @@ -87519,1321 +87729,1481 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a, ; SI-LABEL: bitcast_v32bf16_to_v64i8_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_or_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_writelane_b32 v40, s30, 0 +; SI-NEXT: v_writelane_b32 v40, s31, 1 +; SI-NEXT: v_writelane_b32 v40, s34, 2 +; SI-NEXT: v_writelane_b32 v40, s35, 3 +; SI-NEXT: v_writelane_b32 v40, s36, 4 +; SI-NEXT: v_writelane_b32 v40, s37, 5 +; SI-NEXT: v_writelane_b32 v40, s38, 6 +; SI-NEXT: v_writelane_b32 v40, s39, 7 +; SI-NEXT: v_writelane_b32 v40, s48, 8 +; SI-NEXT: v_writelane_b32 v40, s49, 9 +; SI-NEXT: v_writelane_b32 v40, s50, 10 +; SI-NEXT: v_writelane_b32 v40, s51, 11 +; SI-NEXT: v_writelane_b32 v40, s52, 12 +; SI-NEXT: v_writelane_b32 v40, s53, 13 +; SI-NEXT: v_writelane_b32 v40, s54, 14 +; SI-NEXT: v_writelane_b32 v40, s55, 15 +; SI-NEXT: v_writelane_b32 v40, s64, 16 +; SI-NEXT: v_writelane_b32 v40, s65, 17 +; SI-NEXT: v_writelane_b32 v40, s66, 18 +; SI-NEXT: v_writelane_b32 v40, s67, 19 +; SI-NEXT: v_writelane_b32 v40, s68, 20 +; SI-NEXT: v_writelane_b32 v40, s69, 21 +; SI-NEXT: v_writelane_b32 v40, s70, 22 +; SI-NEXT: v_writelane_b32 v40, s71, 23 +; SI-NEXT: v_writelane_b32 v40, s80, 24 +; SI-NEXT: v_writelane_b32 v40, s81, 25 +; SI-NEXT: v_writelane_b32 v40, s82, 26 +; SI-NEXT: v_writelane_b32 v40, s83, 27 +; SI-NEXT: v_writelane_b32 v40, s84, 28 +; SI-NEXT: v_writelane_b32 v40, s85, 29 +; SI-NEXT: v_writelane_b32 v40, s86, 30 +; SI-NEXT: v_writelane_b32 v40, s87, 31 +; SI-NEXT: v_writelane_b32 v40, s96, 32 +; SI-NEXT: v_writelane_b32 v40, s97, 33 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword 
v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_writelane_b32 v40, s98, 34 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mul_f32_e64 v19, 1.0, s17 -; SI-NEXT: v_mul_f32_e32 v33, 1.0, v2 -; SI-NEXT: v_mul_f32_e32 v27, 1.0, v1 -; SI-NEXT: v_mul_f32_e32 v50, 1.0, v4 -; SI-NEXT: v_mul_f32_e32 v52, 1.0, v3 -; SI-NEXT: v_mul_f32_e32 v39, 1.0, v6 -; SI-NEXT: v_mul_f32_e32 v49, 1.0, v5 -; SI-NEXT: v_mul_f32_e32 v44, 1.0, v8 -; SI-NEXT: v_mul_f32_e32 v46, 1.0, v7 -; SI-NEXT: v_mul_f32_e32 v40, 1.0, v10 -; SI-NEXT: v_mul_f32_e32 v43, 1.0, v9 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_mul_f32_e32 v61, 1.0, v12 -; SI-NEXT: v_mul_f32_e32 v25, 1.0, v11 -; SI-NEXT: v_mul_f32_e32 v57, 1.0, v14 -; SI-NEXT: v_mul_f32_e32 v60, 1.0, v13 -; SI-NEXT: v_mul_f32_e32 v36, 1.0, v16 -; SI-NEXT: v_mul_f32_e32 v37, 1.0, v15 -; SI-NEXT: v_mul_f32_e32 v32, 1.0, v18 -; SI-NEXT: v_mul_f32_e32 v34, 1.0, v17 +; SI-NEXT: v_mul_f32_e32 v20, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v21, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v24, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v25, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v22, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v23, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v28, 1.0, v8 +; SI-NEXT: v_mul_f32_e32 v29, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v26, 1.0, v10 +; SI-NEXT: v_mul_f32_e32 v27, 1.0, v9 +; SI-NEXT: v_mul_f32_e32 v32, 1.0, v12 +; SI-NEXT: v_mul_f32_e32 v33, 1.0, v11 +; SI-NEXT: v_mul_f32_e32 v30, 1.0, v14 +; SI-NEXT: v_mul_f32_e32 v31, 1.0, v13 +; SI-NEXT: v_mul_f32_e32 v35, 1.0, v16 +; SI-NEXT: v_mul_f32_e32 v36, 1.0, v15 +; SI-NEXT: v_mul_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_mul_f32_e32 v17, 1.0, v17 ; SI-NEXT: v_mul_f32_e64 v3, 1.0, s16 -; SI-NEXT: v_mul_f32_e64 v26, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s19 ; SI-NEXT: v_mul_f32_e64 v2, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v5, 1.0, s21 -; SI-NEXT: v_mul_f32_e64 v6, 1.0, s20 -; SI-NEXT: v_mul_f32_e64 v29, 1.0, s23 -; SI-NEXT: v_mul_f32_e64 v4, 1.0, s22 -; SI-NEXT: v_mul_f32_e64 v8, 1.0, s25 -; SI-NEXT: v_mul_f32_e64 v38, 1.0, s24 -; SI-NEXT: v_mul_f32_e64 v35, 1.0, s27 -; SI-NEXT: v_mul_f32_e64 v7, 1.0, s26 -; SI-NEXT: v_mul_f32_e64 v53, 1.0, s29 -; SI-NEXT: v_mul_f32_e64 v55, 1.0, s28 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e64 v6, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v7, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v4, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v5, 1.0, s22 +; SI-NEXT: v_mul_f32_e64 v10, 1.0, s25 +; SI-NEXT: v_mul_f32_e64 v11, 1.0, s24 +; SI-NEXT: v_mul_f32_e64 v8, 1.0, s27 +; SI-NEXT: v_mul_f32_e64 v9, 1.0, s26 +; SI-NEXT: v_mul_f32_e64 v12, 1.0, s29 +; SI-NEXT: v_mul_f32_e64 v13, 1.0, s28 +; SI-NEXT: v_writelane_b32 v40, s99, 35 +; SI-NEXT: ; implicit-def: $vgpr41 : SGPR spill to VGPR lane ; SI-NEXT: s_cbranch_scc0 .LBB109_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v19 -; SI-NEXT: 
v_alignbit_b32 v23, v1, v3, 16 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v5 -; SI-NEXT: v_alignbit_b32 v20, v1, v6, 16 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v8 -; SI-NEXT: v_alignbit_b32 v17, v1, v38, 16 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v53 -; SI-NEXT: v_alignbit_b32 v14, v1, v55, 16 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v50 -; SI-NEXT: v_alignbit_b32 v11, v1, v52, 16 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v44 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v29 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v1, v46, 16 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v61 -; SI-NEXT: v_alignbit_b32 v21, v19, v4, 16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v35 -; SI-NEXT: v_alignbit_b32 v4, v1, v25, 16 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v36 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v26 -; SI-NEXT: v_alignbit_b32 v18, v16, v7, 16 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v33 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v39 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v40 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v57 -; SI-NEXT: v_alignbit_b32 v3, v1, v37, 16 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v32 -; SI-NEXT: v_alignbit_b32 v24, v22, v2, 16 -; SI-NEXT: v_alignbit_b32 v15, v13, v27, 16 -; SI-NEXT: v_alignbit_b32 v12, v10, v49, 16 -; SI-NEXT: v_alignbit_b32 v9, v7, v43, 16 -; SI-NEXT: v_alignbit_b32 v5, v6, v60, 16 -; SI-NEXT: v_alignbit_b32 v2, v1, v34, 16 -; SI-NEXT: v_readfirstlane_b32 s8, v23 -; SI-NEXT: v_readfirstlane_b32 s9, v24 -; SI-NEXT: v_readfirstlane_b32 s14, v20 -; SI-NEXT: v_readfirstlane_b32 s15, v21 -; SI-NEXT: v_readfirstlane_b32 s20, v17 -; SI-NEXT: v_readfirstlane_b32 s21, v18 -; SI-NEXT: v_readfirstlane_b32 s26, v14 -; SI-NEXT: v_readfirstlane_b32 s27, v15 -; SI-NEXT: v_readfirstlane_b32 s42, v11 -; SI-NEXT: v_readfirstlane_b32 s43, v12 -; SI-NEXT: v_readfirstlane_b32 s56, v8 -; SI-NEXT: v_readfirstlane_b32 s57, v9 -; SI-NEXT: v_readfirstlane_b32 s62, v4 -; SI-NEXT: v_readfirstlane_b32 s63, v5 -; SI-NEXT: v_readfirstlane_b32 s76, v3 -; SI-NEXT: v_readfirstlane_b32 s77, v2 -; SI-NEXT: s_lshr_b64 s[4:5], s[8:9], 24 -; SI-NEXT: s_lshr_b64 s[6:7], s[8:9], 16 -; SI-NEXT: s_lshr_b64 s[10:11], s[8:9], 8 -; SI-NEXT: s_lshr_b64 s[8:9], s[14:15], 24 -; SI-NEXT: s_lshr_b64 s[12:13], s[14:15], 16 -; SI-NEXT: s_lshr_b64 s[16:17], s[14:15], 8 -; SI-NEXT: s_lshr_b64 s[14:15], s[20:21], 24 -; SI-NEXT: s_lshr_b64 s[18:19], s[20:21], 16 -; SI-NEXT: s_lshr_b64 s[22:23], s[20:21], 8 -; SI-NEXT: s_lshr_b64 s[20:21], s[26:27], 24 -; SI-NEXT: s_lshr_b64 s[24:25], s[26:27], 16 -; SI-NEXT: s_lshr_b64 s[28:29], s[26:27], 8 -; SI-NEXT: s_lshr_b64 s[26:27], s[42:43], 24 -; SI-NEXT: s_lshr_b64 s[40:41], s[42:43], 16 -; SI-NEXT: s_lshr_b64 s[44:45], s[42:43], 8 -; SI-NEXT: s_lshr_b64 s[42:43], s[56:57], 24 -; SI-NEXT: s_lshr_b64 s[46:47], s[56:57], 16 -; SI-NEXT: s_lshr_b64 s[58:59], s[56:57], 8 -; SI-NEXT: s_lshr_b64 s[56:57], s[62:63], 24 -; SI-NEXT: s_lshr_b64 s[60:61], s[62:63], 16 -; SI-NEXT: s_lshr_b64 s[72:73], s[62:63], 8 -; SI-NEXT: s_lshr_b64 s[62:63], s[76:77], 24 -; SI-NEXT: s_lshr_b64 s[74:75], s[76:77], 16 -; SI-NEXT: s_lshr_b64 s[76:77], s[76:77], 8 -; SI-NEXT: v_lshrrev_b32_e32 v30, 24, v26 -; SI-NEXT: v_lshrrev_b32_e32 v27, 8, v24 -; SI-NEXT: v_lshrrev_b32_e32 v62, 24, v29 -; SI-NEXT: v_lshrrev_b32_e32 v58, 8, v21 -; SI-NEXT: v_lshrrev_b32_e32 v31, 24, v35 -; SI-NEXT: v_lshrrev_b32_e32 v28, 8, v18 -; SI-NEXT: v_lshrrev_b32_e32 v63, 24, v33 -; SI-NEXT: v_lshrrev_b32_e32 v59, 8, v15 -; SI-NEXT: v_lshrrev_b32_e32 v56, 24, v39 -; SI-NEXT: v_lshrrev_b32_e32 v47, 8, v12 -; SI-NEXT: v_lshrrev_b32_e32 v45, 24, v40 -; 
SI-NEXT: v_lshrrev_b32_e32 v41, 8, v9 -; SI-NEXT: v_lshrrev_b32_e32 v42, 24, v57 -; SI-NEXT: v_lshrrev_b32_e32 v54, 8, v5 -; SI-NEXT: v_lshrrev_b32_e32 v51, 24, v32 -; SI-NEXT: v_lshrrev_b32_e32 v48, 8, v2 +; SI-NEXT: v_readfirstlane_b32 s4, v19 +; SI-NEXT: s_lshr_b32 s5, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s4, v3 +; SI-NEXT: s_lshr_b64 s[74:75], s[4:5], 16 +; SI-NEXT: v_readfirstlane_b32 s4, v1 +; SI-NEXT: s_lshr_b32 s73, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s72, v2 +; SI-NEXT: s_lshr_b64 s[76:77], s[72:73], 16 +; SI-NEXT: s_mov_b32 s75, s76 +; SI-NEXT: s_lshr_b64 s[4:5], s[74:75], 24 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_writelane_b32 v41, s4, 0 +; SI-NEXT: v_writelane_b32 v41, s5, 1 +; SI-NEXT: v_readfirstlane_b32 s4, v6 +; SI-NEXT: s_lshr_b32 s5, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s4, v7 +; SI-NEXT: s_lshr_b64 s[60:61], s[4:5], 16 +; SI-NEXT: v_readfirstlane_b32 s4, v4 +; SI-NEXT: s_lshr_b32 s59, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s4, v10 +; SI-NEXT: s_lshr_b32 s5, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s4, v11 +; SI-NEXT: s_lshr_b64 s[46:47], s[4:5], 16 +; SI-NEXT: v_readfirstlane_b32 s4, v8 +; SI-NEXT: s_lshr_b32 s45, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s4, v12 +; SI-NEXT: s_lshr_b32 s5, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s4, v13 +; SI-NEXT: s_lshr_b64 s[26:27], s[4:5], 16 +; SI-NEXT: v_readfirstlane_b32 s4, v20 +; SI-NEXT: s_lshr_b32 s25, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s4, v24 +; SI-NEXT: s_lshr_b32 s5, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s4, v25 +; SI-NEXT: s_lshr_b64 s[16:17], s[4:5], 16 +; SI-NEXT: v_readfirstlane_b32 s4, v22 +; SI-NEXT: s_lshr_b32 s41, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s4, v28 +; SI-NEXT: s_lshr_b32 s5, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s4, v29 +; SI-NEXT: s_lshr_b64 s[20:21], s[4:5], 16 +; SI-NEXT: v_readfirstlane_b32 s4, v26 +; SI-NEXT: s_lshr_b32 s19, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s4, v32 +; SI-NEXT: s_lshr_b32 s5, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s4, v33 +; SI-NEXT: s_lshr_b64 s[12:13], s[4:5], 16 +; SI-NEXT: v_readfirstlane_b32 s4, v30 +; SI-NEXT: s_lshr_b32 s11, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s4, v35 +; SI-NEXT: s_lshr_b32 s5, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s4, v36 +; SI-NEXT: s_lshr_b64 s[6:7], s[4:5], 16 +; SI-NEXT: v_readfirstlane_b32 s4, v18 +; SI-NEXT: v_readfirstlane_b32 s58, v5 +; SI-NEXT: v_readfirstlane_b32 s44, v9 +; SI-NEXT: v_readfirstlane_b32 s24, v21 +; SI-NEXT: v_readfirstlane_b32 s40, v23 +; SI-NEXT: v_readfirstlane_b32 s18, v27 +; SI-NEXT: v_readfirstlane_b32 s10, v31 +; SI-NEXT: s_lshr_b32 s5, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s4, v17 +; SI-NEXT: s_lshr_b64 s[62:63], s[58:59], 16 +; SI-NEXT: s_lshr_b64 s[56:57], s[44:45], 16 +; SI-NEXT: s_lshr_b64 s[42:43], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[22:23], s[40:41], 16 +; SI-NEXT: s_lshr_b64 s[28:29], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[14:15], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[8:9], s[4:5], 16 +; SI-NEXT: s_mov_b32 s61, s62 +; SI-NEXT: s_mov_b32 s47, s56 +; SI-NEXT: s_mov_b32 s27, s42 +; SI-NEXT: s_mov_b32 s17, s22 +; SI-NEXT: s_mov_b32 s21, s28 +; SI-NEXT: s_mov_b32 s13, s14 +; SI-NEXT: s_mov_b32 s7, s8 +; SI-NEXT: s_lshr_b64 s[88:89], s[74:75], 16 +; SI-NEXT: s_lshr_b64 s[92:93], s[74:75], 8 +; SI-NEXT: s_lshr_b64 s[90:91], s[60:61], 24 +; SI-NEXT: s_lshr_b64 s[94:95], s[60:61], 16 +; SI-NEXT: s_lshr_b64 s[30:31], s[60:61], 8 +; SI-NEXT: s_lshr_b64 s[34:35], s[46:47], 24 +; SI-NEXT: s_lshr_b64 s[36:37], s[46:47], 16 +; SI-NEXT: s_lshr_b64 s[38:39], s[46:47], 8 +; SI-NEXT: s_lshr_b64 s[48:49], 
s[26:27], 24 +; SI-NEXT: s_lshr_b64 s[50:51], s[26:27], 16 +; SI-NEXT: s_lshr_b64 s[52:53], s[26:27], 8 +; SI-NEXT: s_lshr_b64 s[54:55], s[16:17], 24 +; SI-NEXT: s_lshr_b64 s[64:65], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[66:67], s[16:17], 8 +; SI-NEXT: s_lshr_b64 s[68:69], s[20:21], 24 +; SI-NEXT: s_lshr_b64 s[70:71], s[20:21], 16 +; SI-NEXT: v_lshrrev_b32_e32 v48, 24, v1 +; SI-NEXT: v_lshrrev_b32_e32 v39, 24, v4 +; SI-NEXT: s_lshr_b32 s24, s76, 8 +; SI-NEXT: v_lshrrev_b32_e32 v38, 24, v8 +; SI-NEXT: s_lshr_b32 s23, s62, 8 +; SI-NEXT: v_lshrrev_b32_e32 v37, 24, v20 +; SI-NEXT: s_lshr_b32 s18, s56, 8 +; SI-NEXT: v_lshrrev_b32_e32 v34, 24, v22 +; SI-NEXT: s_lshr_b32 s17, s42, 8 +; SI-NEXT: v_lshrrev_b32_e32 v16, 24, v26 +; SI-NEXT: s_lshr_b32 s15, s22, 8 +; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v30 +; SI-NEXT: s_lshr_b32 s10, s28, 8 +; SI-NEXT: v_lshrrev_b32_e32 v14, 24, v18 +; SI-NEXT: s_lshr_b32 s9, s14, 8 +; SI-NEXT: s_lshr_b32 s4, s8, 8 +; SI-NEXT: s_lshr_b64 s[78:79], s[20:21], 8 +; SI-NEXT: s_lshr_b64 s[86:87], s[12:13], 24 +; SI-NEXT: s_lshr_b64 s[96:97], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[98:99], s[12:13], 8 +; SI-NEXT: s_lshr_b64 s[80:81], s[6:7], 24 +; SI-NEXT: s_lshr_b64 s[82:83], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[84:85], s[6:7], 8 ; SI-NEXT: s_cbranch_execnz .LBB109_3 ; SI-NEXT: .LBB109_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v53 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v55 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v35 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v36 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 ; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_alignbit_b32 v14, v14, v13, 16 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v38 +; SI-NEXT: v_readfirstlane_b32 s4, v15 +; SI-NEXT: s_lshr_b32 s5, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s4, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v17 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v18 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: s_lshr_b64 s[6:7], s[4:5], 16 +; SI-NEXT: v_readfirstlane_b32 s4, v14 +; SI-NEXT: s_lshr_b32 s5, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s4, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v33 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v32 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 ; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v36 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v44 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v50 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v37 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v46 -; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v52 +; SI-NEXT: s_lshr_b64 s[8:9], s[4:5], 16 +; SI-NEXT: v_readfirstlane_b32 s4, v16 +; SI-NEXT: v_readfirstlane_b32 s10, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v31 +; SI-NEXT: s_lshr_b32 s11, s4, 16 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v15 +; SI-NEXT: s_lshr_b64 s[12:13], s[10:11], 16 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v30 +; SI-NEXT: v_readfirstlane_b32 s10, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v29 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: 
v_and_b32_e32 v17, 0xffff0000, v28 +; SI-NEXT: v_readfirstlane_b32 s4, v15 +; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; SI-NEXT: v_readfirstlane_b32 s16, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v27 +; SI-NEXT: s_lshr_b32 s11, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s4, v17 +; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v26 +; SI-NEXT: v_readfirstlane_b32 s18, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v25 +; SI-NEXT: s_lshr_b32 s17, s4, 16 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v24 +; SI-NEXT: s_lshr_b64 s[20:21], s[16:17], 16 +; SI-NEXT: v_readfirstlane_b32 s4, v16 +; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; SI-NEXT: v_readfirstlane_b32 s16, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v23 +; SI-NEXT: s_lshr_b32 s19, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s4, v18 +; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v22 +; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; SI-NEXT: s_lshr_b32 s17, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s4, v17 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; SI-NEXT: s_lshr_b32 s41, s4, 16 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: v_readfirstlane_b32 s4, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v21 +; SI-NEXT: v_readfirstlane_b32 s24, v13 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v20 +; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; SI-NEXT: s_lshr_b32 s25, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s4, v12 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; SI-NEXT: s_lshr_b64 s[26:27], s[24:25], 16 +; SI-NEXT: s_lshr_b32 s25, s4, 16 ; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v61 +; SI-NEXT: v_readfirstlane_b32 s4, v10 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; SI-NEXT: s_lshr_b32 s45, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s44, v11 +; SI-NEXT: v_readfirstlane_b32 s4, v8 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: s_lshr_b64 s[46:47], s[44:45], 16 +; SI-NEXT: s_lshr_b32 s45, s4, 16 ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_alignbit_b32 v3, v2, v1, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v34 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v25 +; SI-NEXT: v_readfirstlane_b32 s4, v6 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; SI-NEXT: s_lshr_b32 s59, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s58, v7 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; SI-NEXT: v_alignbit_b32 v8, v8, v7, 16 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v43 -; SI-NEXT: v_alignbit_b32 v11, v11, v10, 16 -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v49 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v32 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; SI-NEXT: s_lshr_b64 s[60:61], s[58:59], 16 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: 
v_and_b32_e32 v6, 0xffff0000, v57 -; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v7 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v40 -; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v10 -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v39 -; SI-NEXT: v_add_f32_e32 v32, 0x40c00000, v1 -; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v60 -; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v6 -; SI-NEXT: v_add_f32_e32 v34, 0x40c00000, v7 -; SI-NEXT: v_add_f32_e32 v36, 0x40c00000, v10 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v32 +; SI-NEXT: v_readfirstlane_b32 s58, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v19 +; SI-NEXT: v_readfirstlane_b32 s4, v4 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v25 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v34 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v36 -; SI-NEXT: v_alignbit_b32 v2, v1, v2, 16 -; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 -; SI-NEXT: v_alignbit_b32 v9, v7, v9, 16 -; SI-NEXT: v_alignbit_b32 v12, v10, v12, 16 -; SI-NEXT: v_readfirstlane_b32 s76, v3 -; SI-NEXT: v_readfirstlane_b32 s77, v2 -; SI-NEXT: v_readfirstlane_b32 s62, v4 -; SI-NEXT: v_readfirstlane_b32 s63, v5 -; SI-NEXT: v_readfirstlane_b32 s56, v8 -; SI-NEXT: v_readfirstlane_b32 s57, v9 -; SI-NEXT: v_readfirstlane_b32 s42, v11 -; SI-NEXT: v_readfirstlane_b32 s43, v12 -; SI-NEXT: v_readfirstlane_b32 s26, v14 -; SI-NEXT: s_lshr_b64 s[40:41], s[42:43], 16 -; SI-NEXT: s_lshr_b64 s[44:45], s[42:43], 8 -; SI-NEXT: s_lshr_b64 s[46:47], s[56:57], 16 -; SI-NEXT: s_lshr_b64 s[58:59], s[56:57], 8 -; SI-NEXT: s_lshr_b64 s[60:61], s[62:63], 16 -; SI-NEXT: s_lshr_b64 s[72:73], s[62:63], 8 -; SI-NEXT: s_lshr_b64 s[74:75], s[76:77], 16 -; SI-NEXT: v_lshrrev_b32_e32 v47, 8, v12 -; SI-NEXT: v_lshrrev_b32_e32 v41, 8, v9 -; SI-NEXT: v_lshrrev_b32_e32 v54, 8, v5 -; SI-NEXT: v_lshrrev_b32_e32 v48, 8, v2 -; SI-NEXT: v_lshrrev_b32_e32 v56, 24, v36 -; SI-NEXT: v_lshrrev_b32_e32 v45, 24, v34 -; SI-NEXT: v_lshrrev_b32_e32 v42, 24, v25 -; SI-NEXT: v_lshrrev_b32_e32 v51, 24, v32 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 -; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_alignbit_b32 v17, v17, v16, 16 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: v_readfirstlane_b32 s20, v17 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 -; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 -; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 -; SI-NEXT: v_alignbit_b32 v20, v20, v19, 16 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: v_alignbit_b32 v23, v23, v22, 16 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], 
s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v13 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v33 -; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v16 -; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v35 -; SI-NEXT: v_add_f32_e32 v33, 0x40c00000, v13 -; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v16 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v33 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v31 -; SI-NEXT: v_alignbit_b32 v15, v13, v15, 16 -; SI-NEXT: v_alignbit_b32 v18, v16, v18, 16 -; SI-NEXT: v_readfirstlane_b32 s27, v15 -; SI-NEXT: v_readfirstlane_b32 s21, v18 -; SI-NEXT: v_readfirstlane_b32 s14, v20 -; SI-NEXT: v_readfirstlane_b32 s8, v23 -; SI-NEXT: s_lshr_b64 s[18:19], s[20:21], 16 -; SI-NEXT: s_lshr_b64 s[22:23], s[20:21], 8 -; SI-NEXT: s_lshr_b64 s[24:25], s[26:27], 16 -; SI-NEXT: s_lshr_b64 s[28:29], s[26:27], 8 -; SI-NEXT: v_lshrrev_b32_e32 v28, 8, v18 -; SI-NEXT: v_lshrrev_b32_e32 v59, 8, v15 -; SI-NEXT: v_lshrrev_b32_e32 v31, 24, v31 -; SI-NEXT: v_lshrrev_b32_e32 v63, 24, v33 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 -; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v19 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v29 -; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v22 -; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v26 -; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v19 -; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v22 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v29 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v26 -; SI-NEXT: v_alignbit_b32 v21, v19, v21, 16 -; SI-NEXT: v_alignbit_b32 v24, v22, v24, 16 -; SI-NEXT: v_readfirstlane_b32 s15, v21 -; SI-NEXT: v_readfirstlane_b32 s9, v24 -; SI-NEXT: s_lshr_b64 s[4:5], s[8:9], 24 -; SI-NEXT: s_lshr_b64 s[6:7], s[8:9], 16 -; SI-NEXT: s_lshr_b64 s[10:11], s[8:9], 8 -; SI-NEXT: s_lshr_b64 s[8:9], s[14:15], 24 -; SI-NEXT: s_lshr_b64 s[12:13], s[14:15], 16 -; SI-NEXT: s_lshr_b64 s[16:17], s[14:15], 8 -; SI-NEXT: s_lshr_b64 s[14:15], s[20:21], 24 -; SI-NEXT: s_lshr_b64 s[20:21], s[26:27], 24 -; SI-NEXT: s_lshr_b64 s[26:27], s[42:43], 24 -; SI-NEXT: s_lshr_b64 s[42:43], s[56:57], 24 -; SI-NEXT: s_lshr_b64 s[56:57], s[62:63], 24 -; SI-NEXT: s_lshr_b64 s[62:63], s[76:77], 24 -; SI-NEXT: s_lshr_b64 s[76:77], s[76:77], 8 -; SI-NEXT: v_lshrrev_b32_e32 v27, 8, v24 -; SI-NEXT: v_lshrrev_b32_e32 v58, 8, v21 -; SI-NEXT: v_lshrrev_b32_e32 v30, 24, v26 -; SI-NEXT: v_lshrrev_b32_e32 v62, 24, v29 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: s_lshr_b32 s59, s4, 16 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_readfirstlane_b32 s4, v5 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: s_lshr_b32 s73, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s72, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_readfirstlane_b32 s4, v1 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; SI-NEXT: s_lshr_b64 s[74:75], s[72:73], 16 +; SI-NEXT: s_lshr_b32 s73, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s72, v2 +; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; SI-NEXT: s_lshr_b64 s[76:77], s[72:73], 16 +; SI-NEXT: v_readfirstlane_b32 s40, v18 +; SI-NEXT: v_readfirstlane_b32 s24, v13 +; SI-NEXT: v_readfirstlane_b32 s44, v9 +; SI-NEXT: s_mov_b32 s75, s76 +; SI-NEXT: s_lshr_b64 s[14:15], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[28:29], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[16:17], s[16:17], 16 +; 
SI-NEXT: s_lshr_b64 s[22:23], s[40:41], 16 +; SI-NEXT: s_lshr_b64 s[42:43], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[56:57], s[44:45], 16 +; SI-NEXT: s_lshr_b64 s[62:63], s[58:59], 16 +; SI-NEXT: s_lshr_b64 s[78:79], s[74:75], 24 +; SI-NEXT: s_mov_b32 s7, s8 +; SI-NEXT: s_mov_b32 s13, s14 +; SI-NEXT: s_mov_b32 s21, s28 +; SI-NEXT: s_mov_b32 s17, s22 +; SI-NEXT: s_mov_b32 s27, s42 +; SI-NEXT: s_mov_b32 s47, s56 +; SI-NEXT: s_mov_b32 s61, s62 +; SI-NEXT: v_writelane_b32 v41, s78, 0 +; SI-NEXT: v_writelane_b32 v41, s79, 1 +; SI-NEXT: s_lshr_b64 s[88:89], s[74:75], 16 +; SI-NEXT: s_lshr_b64 s[92:93], s[74:75], 8 +; SI-NEXT: s_lshr_b64 s[90:91], s[60:61], 24 +; SI-NEXT: s_lshr_b64 s[94:95], s[60:61], 16 +; SI-NEXT: s_lshr_b64 s[30:31], s[60:61], 8 +; SI-NEXT: s_lshr_b64 s[34:35], s[46:47], 24 +; SI-NEXT: s_lshr_b64 s[36:37], s[46:47], 16 +; SI-NEXT: s_lshr_b64 s[38:39], s[46:47], 8 +; SI-NEXT: s_lshr_b64 s[48:49], s[26:27], 24 +; SI-NEXT: s_lshr_b64 s[50:51], s[26:27], 16 +; SI-NEXT: s_lshr_b64 s[52:53], s[26:27], 8 +; SI-NEXT: s_lshr_b64 s[54:55], s[16:17], 24 +; SI-NEXT: s_lshr_b64 s[64:65], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[66:67], s[16:17], 8 +; SI-NEXT: s_lshr_b64 s[68:69], s[20:21], 24 +; SI-NEXT: s_lshr_b64 s[70:71], s[20:21], 16 +; SI-NEXT: s_lshr_b32 s24, s76, 8 +; SI-NEXT: s_lshr_b32 s23, s62, 8 +; SI-NEXT: s_lshr_b32 s18, s56, 8 +; SI-NEXT: s_lshr_b32 s17, s42, 8 +; SI-NEXT: s_lshr_b32 s15, s22, 8 +; SI-NEXT: s_lshr_b32 s10, s28, 8 +; SI-NEXT: s_lshr_b32 s9, s14, 8 +; SI-NEXT: s_lshr_b32 s4, s8, 8 +; SI-NEXT: v_lshrrev_b32_e32 v48, 24, v1 +; SI-NEXT: v_lshrrev_b32_e32 v39, 24, v4 +; SI-NEXT: v_lshrrev_b32_e32 v38, 24, v8 +; SI-NEXT: v_lshrrev_b32_e32 v37, 24, v12 +; SI-NEXT: v_lshrrev_b32_e32 v34, 24, v17 +; SI-NEXT: v_lshrrev_b32_e32 v16, 24, v16 +; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v15 +; SI-NEXT: v_lshrrev_b32_e32 v14, 24, v14 +; SI-NEXT: s_lshr_b64 s[78:79], s[20:21], 8 +; SI-NEXT: s_lshr_b64 s[86:87], s[12:13], 24 +; SI-NEXT: s_lshr_b64 s[96:97], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[98:99], s[12:13], 8 +; SI-NEXT: s_lshr_b64 s[80:81], s[6:7], 24 +; SI-NEXT: s_lshr_b64 s[82:83], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[84:85], s[6:7], 8 ; SI-NEXT: .LBB109_3: ; %end -; SI-NEXT: v_and_b32_e32 v23, 0xff, v23 -; SI-NEXT: s_lshl_b32 s5, s10, 8 -; SI-NEXT: v_or_b32_e32 v23, s5, v23 -; SI-NEXT: s_and_b32 s5, s6, 0xff -; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: s_lshl_b32 s4, s4, 24 -; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_or_b32_e32 v23, s4, v23 -; SI-NEXT: buffer_store_dword v23, v0, s[0:3], 0 offen +; SI-NEXT: s_and_b32 s7, s74, 0xff +; SI-NEXT: s_lshl_b32 s13, s92, 8 +; SI-NEXT: s_or_b32 s7, s7, s13 +; SI-NEXT: s_and_b32 s13, s88, 0xff +; SI-NEXT: v_readlane_b32 s74, v41, 0 +; SI-NEXT: s_lshl_b32 s21, s74, 24 +; SI-NEXT: s_lshl_b32 s13, s13, 16 +; SI-NEXT: s_or_b32 s13, s21, s13 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_or_b32 s7, s7, s13 +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: s_and_b32 s7, s76, 0xff +; SI-NEXT: s_lshl_b32 s13, s24, 8 +; SI-NEXT: s_or_b32 s7, s7, s13 +; SI-NEXT: s_and_b32 s13, s73, 0xff +; SI-NEXT: s_lshl_b32 s13, s13, 16 +; SI-NEXT: v_lshlrev_b32_e32 v2, 24, v48 +; SI-NEXT: v_or_b32_e32 v2, s13, v2 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: v_or_b32_e32 v2, s7, v2 +; SI-NEXT: s_and_b32 s7, s60, 0xff +; SI-NEXT: s_lshl_b32 s13, s30, 8 +; SI-NEXT: s_or_b32 s7, s7, s13 +; SI-NEXT: s_and_b32 s13, s94, 0xff +; SI-NEXT: s_lshl_b32 s13, s13, 16 +; SI-NEXT: s_lshl_b32 s21, s90, 24 +; SI-NEXT: s_and_b32 
s7, s7, 0xffff +; SI-NEXT: s_or_b32 s13, s21, s13 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v23, 0xff, v24 -; SI-NEXT: v_lshlrev_b32_e32 v24, 8, v27 -; SI-NEXT: v_and_b32_e32 v22, 0xff, v22 -; SI-NEXT: v_and_b32_e32 v20, 0xff, v20 -; SI-NEXT: s_lshl_b32 s4, s16, 8 -; SI-NEXT: v_or_b32_e32 v23, v23, v24 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_lshlrev_b32_e32 v24, 24, v30 -; SI-NEXT: v_or_b32_e32 v20, s4, v20 -; SI-NEXT: s_and_b32 s4, s12, 0xff -; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; SI-NEXT: v_or_b32_e32 v22, v24, v22 -; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: s_lshl_b32 s5, s8, 24 -; SI-NEXT: v_or_b32_e32 v22, v23, v22 -; SI-NEXT: v_add_i32_e32 v23, vcc, 4, v0 -; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: buffer_store_dword v22, v23, s[0:3], 0 offen -; SI-NEXT: v_or_b32_e32 v20, s4, v20 +; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; SI-NEXT: s_or_b32 s7, s7, s13 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v22, vcc, 8, v0 -; SI-NEXT: buffer_store_dword v20, v22, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v2, s7 +; SI-NEXT: s_and_b32 s7, s62, 0xff +; SI-NEXT: s_lshl_b32 s13, s23, 8 +; SI-NEXT: v_add_i32_e32 v1, vcc, 8, v0 +; SI-NEXT: s_or_b32 s7, s7, s13 +; SI-NEXT: s_and_b32 s13, s59, 0xff +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_lshl_b32 s13, s13, 16 +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v39 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: v_or_b32_e32 v1, s13, v1 +; SI-NEXT: v_or_b32_e32 v1, s7, v1 +; SI-NEXT: s_and_b32 s7, s46, 0xff +; SI-NEXT: s_lshl_b32 s13, s38, 8 +; SI-NEXT: s_or_b32 s7, s7, s13 +; SI-NEXT: s_and_b32 s13, s36, 0xff +; SI-NEXT: s_lshl_b32 s13, s13, 16 +; SI-NEXT: s_lshl_b32 s21, s34, 24 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_or_b32 s13, s21, s13 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v20, 0xff, v21 -; SI-NEXT: v_lshlrev_b32_e32 v21, 8, v58 -; SI-NEXT: v_and_b32_e32 v19, 0xff, v19 -; SI-NEXT: v_and_b32_e32 v17, 0xff, v17 -; SI-NEXT: s_lshl_b32 s4, s22, 8 -; SI-NEXT: v_or_b32_e32 v20, v20, v21 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v21, 24, v62 -; SI-NEXT: v_or_b32_e32 v17, s4, v17 -; SI-NEXT: s_and_b32 s4, s18, 0xff -; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: s_lshl_b32 s5, s14, 24 -; SI-NEXT: v_or_b32_e32 v19, v20, v19 -; SI-NEXT: v_add_i32_e32 v20, vcc, 12, v0 -; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen -; SI-NEXT: v_or_b32_e32 v17, s4, v17 +; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 +; SI-NEXT: s_or_b32 s7, s7, s13 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v2, s7 +; SI-NEXT: s_and_b32 s7, s56, 0xff +; SI-NEXT: s_lshl_b32 s13, s18, 8 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v19, vcc, 16, v0 -; SI-NEXT: buffer_store_dword v17, v19, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v1, vcc, 16, v0 +; SI-NEXT: s_or_b32 s7, s7, s13 +; SI-NEXT: s_and_b32 s13, s45, 0xff +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_lshl_b32 s13, s13, 16 +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v38 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: v_or_b32_e32 v1, s13, v1 +; SI-NEXT: v_or_b32_e32 v1, s7, v1 +; SI-NEXT: s_and_b32 s7, s26, 0xff +; SI-NEXT: s_lshl_b32 s13, 
s52, 8 +; SI-NEXT: s_or_b32 s7, s7, s13 +; SI-NEXT: s_and_b32 s13, s50, 0xff +; SI-NEXT: s_lshl_b32 s13, s13, 16 +; SI-NEXT: s_lshl_b32 s18, s48, 24 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_or_b32 s13, s18, s13 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v17, 0xff, v18 -; SI-NEXT: v_lshlrev_b32_e32 v18, 8, v28 -; SI-NEXT: v_and_b32_e32 v16, 0xff, v16 -; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 -; SI-NEXT: s_lshl_b32 s4, s28, 8 -; SI-NEXT: v_or_b32_e32 v17, v17, v18 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_lshlrev_b32_e32 v18, 24, v31 -; SI-NEXT: v_or_b32_e32 v14, s4, v14 -; SI-NEXT: s_and_b32 s4, s24, 0xff -; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; SI-NEXT: v_or_b32_e32 v16, v18, v16 -; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: s_lshl_b32 s5, s20, 24 -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: v_add_i32_e32 v17, vcc, 20, v0 -; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: buffer_store_dword v16, v17, s[0:3], 0 offen -; SI-NEXT: v_or_b32_e32 v14, s4, v14 +; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 +; SI-NEXT: s_or_b32 s7, s7, s13 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v2, s7 +; SI-NEXT: s_and_b32 s7, s42, 0xff +; SI-NEXT: s_lshl_b32 s13, s17, 8 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v16, vcc, 24, v0 -; SI-NEXT: buffer_store_dword v14, v16, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v1, vcc, 24, v0 +; SI-NEXT: s_or_b32 s7, s7, s13 +; SI-NEXT: s_and_b32 s13, s25, 0xff +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_lshl_b32 s13, s13, 16 +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v37 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: v_or_b32_e32 v1, s13, v1 +; SI-NEXT: v_or_b32_e32 v1, s7, v1 +; SI-NEXT: s_and_b32 s7, s16, 0xff +; SI-NEXT: s_lshl_b32 s13, s66, 8 +; SI-NEXT: s_or_b32 s7, s7, s13 +; SI-NEXT: s_and_b32 s13, s64, 0xff +; SI-NEXT: s_lshl_b32 s13, s13, 16 +; SI-NEXT: s_lshl_b32 s16, s54, 24 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_or_b32 s13, s16, s13 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v14, 0xff, v15 -; SI-NEXT: v_lshlrev_b32_e32 v15, 8, v59 -; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 -; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 -; SI-NEXT: s_lshl_b32 s4, s44, 8 -; SI-NEXT: v_or_b32_e32 v14, v14, v15 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v15, 24, v63 -; SI-NEXT: v_or_b32_e32 v11, s4, v11 -; SI-NEXT: s_and_b32 s4, s40, 0xff -; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: s_lshl_b32 s5, s26, 24 -; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: v_add_i32_e32 v14, vcc, 28, v0 -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen -; SI-NEXT: v_or_b32_e32 v11, s4, v11 +; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 +; SI-NEXT: s_or_b32 s7, s7, s13 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v2, s7 +; SI-NEXT: s_and_b32 s7, s22, 0xff +; SI-NEXT: s_lshl_b32 s13, s15, 8 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v13, vcc, 32, v0 -; SI-NEXT: buffer_store_dword v11, v13, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v1, vcc, 32, v0 +; SI-NEXT: s_or_b32 s7, s7, s13 +; SI-NEXT: s_and_b32 s13, s41, 0xff +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_lshl_b32 s13, s13, 16 +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v34 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: 
v_or_b32_e32 v1, s13, v1 +; SI-NEXT: v_or_b32_e32 v1, s7, v1 +; SI-NEXT: s_and_b32 s7, s20, 0xff +; SI-NEXT: s_lshl_b32 s13, s78, 8 +; SI-NEXT: s_or_b32 s7, s7, s13 +; SI-NEXT: s_and_b32 s13, s70, 0xff +; SI-NEXT: s_lshl_b32 s13, s13, 16 +; SI-NEXT: s_lshl_b32 s15, s68, 24 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_or_b32 s13, s15, s13 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v11, 0xff, v12 -; SI-NEXT: v_lshlrev_b32_e32 v12, 8, v47 -; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 -; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 -; SI-NEXT: s_lshl_b32 s4, s58, 8 -; SI-NEXT: v_or_b32_e32 v11, v11, v12 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v56 -; SI-NEXT: v_or_b32_e32 v8, s4, v8 -; SI-NEXT: s_and_b32 s4, s46, 0xff -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; SI-NEXT: v_or_b32_e32 v10, v12, v10 -; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: s_lshl_b32 s5, s42, 24 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_add_i32_e32 v11, vcc, 36, v0 -; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: buffer_store_dword v10, v11, s[0:3], 0 offen -; SI-NEXT: v_or_b32_e32 v8, s4, v8 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v10, vcc, 40, v0 -; SI-NEXT: buffer_store_dword v8, v10, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 +; SI-NEXT: s_or_b32 s7, s7, s13 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v2, s7 +; SI-NEXT: s_and_b32 s7, s28, 0xff +; SI-NEXT: s_lshl_b32 s10, s10, 8 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v8, 0xff, v9 -; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v41 -; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 -; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 -; SI-NEXT: s_lshl_b32 s4, s72, 8 -; SI-NEXT: v_or_b32_e32 v8, v8, v9 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v45 -; SI-NEXT: v_or_b32_e32 v4, s4, v4 -; SI-NEXT: s_and_b32 s4, s60, 0xff -; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: s_lshl_b32 s5, s56, 24 -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_add_i32_e32 v8, vcc, 44, v0 -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen -; SI-NEXT: v_or_b32_e32 v4, s4, v4 +; SI-NEXT: v_add_i32_e32 v1, vcc, 40, v0 +; SI-NEXT: s_or_b32 s7, s7, s10 +; SI-NEXT: s_and_b32 s10, s19, 0xff +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_lshl_b32 s10, s10, 16 +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v16 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: v_or_b32_e32 v1, s10, v1 +; SI-NEXT: v_or_b32_e32 v1, s7, v1 +; SI-NEXT: s_and_b32 s7, s12, 0xff +; SI-NEXT: s_lshl_b32 s10, s98, 8 +; SI-NEXT: s_or_b32 s7, s7, s10 +; SI-NEXT: s_and_b32 s10, s96, 0xff +; SI-NEXT: s_lshl_b32 s10, s10, 16 +; SI-NEXT: s_lshl_b32 s12, s86, 24 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_or_b32 s10, s12, s10 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v7, vcc, 48, v0 -; SI-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 +; SI-NEXT: s_or_b32 s7, s7, s10 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v2, s7 +; SI-NEXT: s_and_b32 s7, s14, 0xff +; SI-NEXT: s_lshl_b32 s9, s9, 8 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v4, 0xff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v54 -; SI-NEXT: v_or_b32_e32 v4, v4, v5 -; SI-NEXT: v_and_b32_e32 v5, 0xff, v6 -; SI-NEXT: v_and_b32_e32 v3, 
0xff, v3 -; SI-NEXT: s_lshl_b32 s4, s76, 8 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v42 -; SI-NEXT: v_or_b32_e32 v3, s4, v3 -; SI-NEXT: s_and_b32 s4, s74, 0xff -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: s_lshl_b32 s5, s62, 24 -; SI-NEXT: v_or_b32_e32 v4, v4, v5 -; SI-NEXT: v_add_i32_e32 v5, vcc, 52, v0 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen -; SI-NEXT: v_or_b32_e32 v3, s4, v3 +; SI-NEXT: v_add_i32_e32 v1, vcc, 48, v0 +; SI-NEXT: s_or_b32 s7, s7, s9 +; SI-NEXT: s_and_b32 s9, s11, 0xff +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_lshl_b32 s9, s9, 16 +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v15 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: v_or_b32_e32 v1, s9, v1 +; SI-NEXT: v_or_b32_e32 v1, s7, v1 +; SI-NEXT: s_and_b32 s6, s6, 0xff +; SI-NEXT: s_lshl_b32 s7, s84, 8 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s82, 0xff +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_lshl_b32 s9, s80, 24 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_or_b32 s7, s9, s7 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v4, vcc, 56, v0 -; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v48 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_or_b32_e32 v2, v2, v3 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v51 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v1, vcc, 56, v0 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: s_and_b32 s6, s8, 0xff +; SI-NEXT: s_lshl_b32 s4, s4, 8 +; SI-NEXT: s_and_b32 s5, s5, 0xff +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_or_b32 s4, s6, s4 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v14 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: v_or_b32_e32 v1, s5, v1 +; SI-NEXT: v_or_b32_e32 v1, s4, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 60, v0 +; SI-NEXT: v_readlane_b32 s75, v41, 1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: 
buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: v_readlane_b32 s99, v40, 35 +; SI-NEXT: v_readlane_b32 s98, v40, 34 +; SI-NEXT: v_readlane_b32 s97, v40, 33 +; SI-NEXT: v_readlane_b32 s96, v40, 32 +; SI-NEXT: v_readlane_b32 s87, v40, 31 +; SI-NEXT: v_readlane_b32 s86, v40, 30 +; SI-NEXT: v_readlane_b32 s85, v40, 29 +; SI-NEXT: v_readlane_b32 s84, v40, 28 +; SI-NEXT: v_readlane_b32 s83, v40, 27 +; SI-NEXT: v_readlane_b32 s82, v40, 26 +; SI-NEXT: v_readlane_b32 s81, v40, 25 +; SI-NEXT: v_readlane_b32 s80, v40, 24 +; SI-NEXT: v_readlane_b32 s71, v40, 23 +; SI-NEXT: v_readlane_b32 s70, v40, 22 +; SI-NEXT: v_readlane_b32 s69, v40, 21 +; SI-NEXT: v_readlane_b32 s68, v40, 20 +; SI-NEXT: v_readlane_b32 s67, v40, 19 +; SI-NEXT: v_readlane_b32 s66, v40, 18 +; SI-NEXT: v_readlane_b32 s65, v40, 17 +; SI-NEXT: v_readlane_b32 s64, v40, 16 +; SI-NEXT: v_readlane_b32 s55, v40, 15 +; SI-NEXT: v_readlane_b32 s54, v40, 14 +; SI-NEXT: v_readlane_b32 s53, v40, 13 +; SI-NEXT: v_readlane_b32 s52, v40, 12 +; SI-NEXT: v_readlane_b32 s51, v40, 11 +; SI-NEXT: v_readlane_b32 s50, v40, 10 +; SI-NEXT: v_readlane_b32 s49, v40, 9 +; SI-NEXT: v_readlane_b32 s48, v40, 8 +; SI-NEXT: v_readlane_b32 s39, v40, 7 +; SI-NEXT: v_readlane_b32 s38, v40, 6 +; SI-NEXT: v_readlane_b32 s37, v40, 5 +; SI-NEXT: v_readlane_b32 s36, v40, 4 +; SI-NEXT: v_readlane_b32 s35, v40, 3 +; SI-NEXT: v_readlane_b32 s34, v40, 2 +; SI-NEXT: v_readlane_b32 s31, v40, 1 +; SI-NEXT: v_readlane_b32 s30, v40, 0 +; SI-NEXT: s_or_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: s_mov_b64 exec, s[4:5] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB109_4: -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $sgpr10 -; SI-NEXT: ; implicit-def: $sgpr6 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_writelane_b32 v41, s4, 0 +; SI-NEXT: ; implicit-def: $sgpr74 +; SI-NEXT: ; implicit-def: $sgpr92 +; SI-NEXT: ; implicit-def: $sgpr88 +; SI-NEXT: v_writelane_b32 v41, s5, 1 +; SI-NEXT: ; implicit-def: $sgpr76 +; SI-NEXT: ; implicit-def: $sgpr24 +; SI-NEXT: ; implicit-def: $sgpr73 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; implicit-def: $sgpr30 +; SI-NEXT: ; implicit-def: $sgpr94 +; SI-NEXT: ; implicit-def: $sgpr90 +; SI-NEXT: ; implicit-def: $sgpr62 +; SI-NEXT: ; implicit-def: $sgpr23 +; SI-NEXT: ; implicit-def: $sgpr59 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr38 +; SI-NEXT: ; implicit-def: $sgpr36 +; SI-NEXT: ; implicit-def: $sgpr34 +; SI-NEXT: ; implicit-def: $sgpr56 +; SI-NEXT: ; implicit-def: $sgpr18 +; SI-NEXT: ; implicit-def: $sgpr45 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr52 +; SI-NEXT: ; implicit-def: $sgpr50 +; SI-NEXT: ; implicit-def: $sgpr48 +; SI-NEXT: ; implicit-def: $sgpr42 +; SI-NEXT: ; implicit-def: $sgpr17 +; SI-NEXT: ; implicit-def: $sgpr25 +; SI-NEXT: ; implicit-def: $vgpr37 ; 
SI-NEXT: ; implicit-def: $sgpr16 -; SI-NEXT: ; implicit-def: $sgpr12 -; SI-NEXT: ; implicit-def: $sgpr8 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $sgpr66 +; SI-NEXT: ; implicit-def: $sgpr64 +; SI-NEXT: ; implicit-def: $sgpr54 ; SI-NEXT: ; implicit-def: $sgpr22 -; SI-NEXT: ; implicit-def: $sgpr18 -; SI-NEXT: ; implicit-def: $sgpr14 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $sgpr15 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $sgpr10 ; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $sgpr9 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $sgpr4 ; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $sgpr28 -; SI-NEXT: ; implicit-def: $sgpr24 +; SI-NEXT: ; implicit-def: $sgpr41 ; SI-NEXT: ; implicit-def: $sgpr20 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $sgpr44 -; SI-NEXT: ; implicit-def: $sgpr40 -; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $sgpr58 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: ; implicit-def: $sgpr42 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $sgpr72 -; SI-NEXT: ; implicit-def: $sgpr60 -; SI-NEXT: ; implicit-def: $sgpr56 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $sgpr76 -; SI-NEXT: ; implicit-def: $sgpr74 -; SI-NEXT: ; implicit-def: $sgpr62 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $sgpr78 +; SI-NEXT: ; implicit-def: $sgpr70 +; SI-NEXT: ; implicit-def: $sgpr68 +; SI-NEXT: ; implicit-def: $sgpr28 +; SI-NEXT: ; implicit-def: $sgpr19 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr98 +; SI-NEXT: ; implicit-def: $sgpr96 +; SI-NEXT: ; implicit-def: $sgpr86 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr11 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr84 +; SI-NEXT: ; implicit-def: $sgpr82 +; SI-NEXT: ; implicit-def: $sgpr80 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr5 ; SI-NEXT: s_branch .LBB109_2 ; ; VI-LABEL: bitcast_v32bf16_to_v64i8_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: s_or_saveexec_b64 s[4:5], -1 -; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill ; VI-NEXT: s_mov_b64 exec, s[4:5] -; VI-NEXT: v_writelane_b32 v63, s30, 0 -; VI-NEXT: v_writelane_b32 v63, s31, 1 -; VI-NEXT: v_writelane_b32 v63, s34, 2 -; VI-NEXT: v_writelane_b32 v63, s35, 3 -; VI-NEXT: v_writelane_b32 v63, s36, 4 -; 
VI-NEXT: v_writelane_b32 v63, s37, 5 -; VI-NEXT: v_writelane_b32 v63, s38, 6 -; VI-NEXT: v_writelane_b32 v63, s39, 7 -; VI-NEXT: v_writelane_b32 v63, s48, 8 -; VI-NEXT: v_writelane_b32 v63, s49, 9 -; VI-NEXT: v_writelane_b32 v63, s50, 10 -; VI-NEXT: v_writelane_b32 v63, s51, 11 -; VI-NEXT: v_writelane_b32 v63, s52, 12 -; VI-NEXT: v_writelane_b32 v63, s53, 13 -; VI-NEXT: v_writelane_b32 v63, s54, 14 -; VI-NEXT: v_writelane_b32 v63, s55, 15 -; VI-NEXT: v_writelane_b32 v63, s64, 16 -; VI-NEXT: v_writelane_b32 v63, s65, 17 -; VI-NEXT: v_writelane_b32 v63, s66, 18 +; VI-NEXT: v_writelane_b32 v4, s30, 0 +; VI-NEXT: v_writelane_b32 v4, s31, 1 +; VI-NEXT: v_writelane_b32 v4, s34, 2 +; VI-NEXT: v_writelane_b32 v4, s35, 3 +; VI-NEXT: v_writelane_b32 v4, s36, 4 +; VI-NEXT: v_writelane_b32 v4, s37, 5 +; VI-NEXT: v_writelane_b32 v4, s38, 6 +; VI-NEXT: v_writelane_b32 v4, s39, 7 +; VI-NEXT: v_writelane_b32 v4, s48, 8 +; VI-NEXT: v_writelane_b32 v4, s49, 9 +; VI-NEXT: v_writelane_b32 v4, s50, 10 +; VI-NEXT: v_writelane_b32 v4, s51, 11 +; VI-NEXT: v_writelane_b32 v4, s52, 12 +; VI-NEXT: v_writelane_b32 v4, s53, 13 +; VI-NEXT: v_writelane_b32 v4, s54, 14 +; VI-NEXT: v_writelane_b32 v4, s55, 15 +; VI-NEXT: v_writelane_b32 v4, s64, 16 +; VI-NEXT: v_writelane_b32 v4, s65, 17 +; VI-NEXT: v_writelane_b32 v4, s66, 18 +; VI-NEXT: v_writelane_b32 v4, s67, 19 +; VI-NEXT: v_writelane_b32 v4, s68, 20 +; VI-NEXT: v_writelane_b32 v4, s69, 21 +; VI-NEXT: v_writelane_b32 v4, s70, 22 +; VI-NEXT: v_writelane_b32 v4, s71, 23 +; VI-NEXT: v_writelane_b32 v4, s80, 24 +; VI-NEXT: v_writelane_b32 v4, s81, 25 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; VI-NEXT: v_writelane_b32 v63, s67, 19 +; VI-NEXT: v_writelane_b32 v4, s82, 26 ; VI-NEXT: v_readfirstlane_b32 s4, v1 ; VI-NEXT: s_and_b64 s[6:7], vcc, exec ; VI-NEXT: v_readfirstlane_b32 s5, v2 -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 ; 4-byte Folded Spill -; VI-NEXT: s_cbranch_scc0 .LBB109_3 +; VI-NEXT: v_writelane_b32 v4, s83, 27 +; VI-NEXT: s_cbranch_scc0 .LBB109_4 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: s_lshr_b32 s56, s5, 24 -; VI-NEXT: s_lshr_b32 s57, s5, 16 -; VI-NEXT: s_lshr_b32 s59, s5, 8 -; VI-NEXT: s_lshr_b32 s58, s4, 16 -; VI-NEXT: s_lshr_b32 s60, s4, 8 -; VI-NEXT: s_lshr_b32 s61, s29, 24 -; VI-NEXT: s_lshr_b32 s62, s29, 16 -; VI-NEXT: s_lshr_b32 s72, s29, 8 -; VI-NEXT: s_lshr_b32 s63, s28, 
16 -; VI-NEXT: s_lshr_b32 s73, s28, 8 -; VI-NEXT: s_lshr_b32 s74, s27, 24 -; VI-NEXT: s_lshr_b32 s75, s27, 16 -; VI-NEXT: s_lshr_b32 s77, s27, 8 -; VI-NEXT: s_lshr_b32 s76, s26, 16 -; VI-NEXT: s_lshr_b32 s78, s26, 8 -; VI-NEXT: s_lshr_b32 s79, s25, 24 -; VI-NEXT: s_lshr_b32 s88, s25, 16 -; VI-NEXT: s_lshr_b32 s90, s25, 8 -; VI-NEXT: s_lshr_b32 s89, s24, 16 -; VI-NEXT: s_lshr_b32 s91, s24, 8 -; VI-NEXT: s_lshr_b32 s30, s23, 24 -; VI-NEXT: s_lshr_b32 s31, s23, 16 -; VI-NEXT: s_lshr_b32 s35, s23, 8 -; VI-NEXT: s_lshr_b32 s34, s22, 16 -; VI-NEXT: s_lshr_b32 s36, s22, 8 -; VI-NEXT: s_lshr_b32 s37, s21, 24 -; VI-NEXT: s_lshr_b32 s38, s21, 16 -; VI-NEXT: s_lshr_b32 s48, s21, 8 -; VI-NEXT: s_lshr_b32 s39, s20, 16 -; VI-NEXT: s_lshr_b32 s49, s20, 8 -; VI-NEXT: s_lshr_b32 s50, s19, 24 -; VI-NEXT: s_lshr_b32 s51, s19, 16 -; VI-NEXT: s_lshr_b32 s53, s19, 8 -; VI-NEXT: s_lshr_b32 s52, s18, 16 -; VI-NEXT: s_lshr_b32 s54, s18, 8 -; VI-NEXT: s_lshr_b32 s55, s17, 24 -; VI-NEXT: s_lshr_b32 s64, s17, 16 -; VI-NEXT: s_lshr_b32 s66, s17, 8 -; VI-NEXT: s_lshr_b32 s65, s16, 16 -; VI-NEXT: s_lshr_b32 s67, s16, 8 -; VI-NEXT: s_lshr_b64 s[44:45], s[4:5], 24 -; VI-NEXT: s_lshr_b64 s[42:43], s[28:29], 24 -; VI-NEXT: s_lshr_b64 s[40:41], s[26:27], 24 -; VI-NEXT: s_lshr_b64 s[14:15], s[24:25], 24 -; VI-NEXT: s_lshr_b64 s[12:13], s[22:23], 24 -; VI-NEXT: s_lshr_b64 s[10:11], s[20:21], 24 -; VI-NEXT: s_lshr_b64 s[8:9], s[18:19], 24 -; VI-NEXT: s_lshr_b64 s[6:7], s[16:17], 24 -; VI-NEXT: s_cbranch_execnz .LBB109_4 +; VI-NEXT: s_lshr_b32 s7, s5, 24 +; VI-NEXT: s_lshr_b32 s9, s5, 16 +; VI-NEXT: s_lshr_b32 s11, s5, 8 +; VI-NEXT: s_lshr_b32 s13, s4, 16 +; VI-NEXT: s_lshr_b32 s15, s4, 8 +; VI-NEXT: s_lshr_b32 s41, s29, 24 +; VI-NEXT: s_lshr_b32 s47, s29, 16 +; VI-NEXT: s_lshr_b32 s57, s29, 8 +; VI-NEXT: s_lshr_b32 s88, s28, 16 +; VI-NEXT: s_lshr_b32 s89, s28, 8 +; VI-NEXT: s_lshr_b32 s90, s27, 24 +; VI-NEXT: s_lshr_b32 s91, s27, 16 +; VI-NEXT: s_lshr_b32 s30, s27, 8 +; VI-NEXT: s_lshr_b32 s31, s26, 16 +; VI-NEXT: s_lshr_b32 s34, s26, 8 +; VI-NEXT: s_lshr_b32 s35, s25, 24 +; VI-NEXT: s_lshr_b32 s36, s25, 16 +; VI-NEXT: s_lshr_b32 s37, s25, 8 +; VI-NEXT: s_lshr_b32 s38, s24, 16 +; VI-NEXT: s_lshr_b32 s39, s24, 8 +; VI-NEXT: s_lshr_b32 s48, s23, 24 +; VI-NEXT: s_lshr_b32 s49, s23, 16 +; VI-NEXT: s_lshr_b32 s50, s23, 8 +; VI-NEXT: s_lshr_b32 s51, s22, 16 +; VI-NEXT: s_lshr_b32 s52, s22, 8 +; VI-NEXT: s_lshr_b32 s53, s21, 24 +; VI-NEXT: s_lshr_b32 s54, s21, 16 +; VI-NEXT: s_lshr_b32 s55, s21, 8 +; VI-NEXT: s_lshr_b32 s64, s20, 16 +; VI-NEXT: s_lshr_b32 s65, s20, 8 +; VI-NEXT: s_lshr_b32 s66, s19, 24 +; VI-NEXT: s_lshr_b32 s67, s19, 16 +; VI-NEXT: s_lshr_b32 s68, s19, 8 +; VI-NEXT: s_lshr_b32 s69, s18, 16 +; VI-NEXT: s_lshr_b32 s70, s18, 8 +; VI-NEXT: s_lshr_b32 s71, s17, 24 +; VI-NEXT: s_lshr_b32 s80, s17, 16 +; VI-NEXT: s_lshr_b32 s81, s17, 8 +; VI-NEXT: s_lshr_b32 s82, s16, 16 +; VI-NEXT: s_lshr_b32 s83, s16, 8 +; VI-NEXT: s_lshr_b64 s[42:43], s[4:5], 24 +; VI-NEXT: s_lshr_b64 s[44:45], s[28:29], 24 +; VI-NEXT: s_lshr_b64 s[58:59], s[26:27], 24 +; VI-NEXT: s_lshr_b64 s[60:61], s[24:25], 24 +; VI-NEXT: s_lshr_b64 s[62:63], s[22:23], 24 +; VI-NEXT: s_lshr_b64 s[72:73], s[20:21], 24 +; VI-NEXT: s_lshr_b64 s[74:75], s[18:19], 24 +; VI-NEXT: s_lshr_b64 s[76:77], s[16:17], 24 +; VI-NEXT: s_mov_b32 s6, s17 +; VI-NEXT: s_mov_b32 s8, s19 +; VI-NEXT: s_mov_b32 s10, s21 +; VI-NEXT: s_mov_b32 s12, s23 +; VI-NEXT: s_mov_b32 s14, s25 +; VI-NEXT: s_mov_b32 s40, s27 +; VI-NEXT: s_mov_b32 s46, s29 +; VI-NEXT: s_mov_b32 s56, s5 +; VI-NEXT: 
s_cbranch_execnz .LBB109_3 ; VI-NEXT: .LBB109_2: ; %cmp.true ; VI-NEXT: s_lshl_b32 s6, s17, 16 -; VI-NEXT: v_mov_b32_e32 v15, 0x40c00000 -; VI-NEXT: v_add_f32_e32 v1, s6, v15 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s6, s17, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s6, v15 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_mov_b32_e32 v1, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v2, s6, v1 +; VI-NEXT: v_readfirstlane_b32 s6, v2 +; VI-NEXT: s_bfe_u32 s7, s6, 0x10010 +; VI-NEXT: s_add_i32 s7, s7, s6 +; VI-NEXT: s_add_i32 s8, s7, 0x7fff +; VI-NEXT: s_or_b32 s9, s6, 0x400000 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s6, s16, 16 -; VI-NEXT: v_alignbit_b32 v2, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s6, v15 -; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s6, s16, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc -; VI-NEXT: v_add_f32_e32 v3, s6, v15 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: s_lshl_b32 s6, s19, 16 -; VI-NEXT: v_alignbit_b32 v1, v3, v1, 16 -; VI-NEXT: v_add_f32_e32 v3, s6, v15 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: s_and_b32 s6, s19, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: v_add_f32_e32 v4, s6, v15 -; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; VI-NEXT: s_lshl_b32 s6, s18, 16 -; VI-NEXT: v_alignbit_b32 v4, v4, v3, 16 -; VI-NEXT: v_add_f32_e32 v3, s6, v15 -; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: s_and_b32 s6, s18, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc -; VI-NEXT: v_add_f32_e32 v5, s6, v15 -; VI-NEXT: v_bfe_u32 v6, v5, 16, 1 -; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v5 -; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; VI-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; VI-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc -; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; VI-NEXT: s_lshl_b32 s6, s21, 16 -; VI-NEXT: v_alignbit_b32 v3, v5, v3, 16 -; VI-NEXT: v_add_f32_e32 v5, s6, v15 -; VI-NEXT: v_bfe_u32 v6, v5, 16, 1 -; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v5 -; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; VI-NEXT: v_or_b32_e32 v7, 
0x400000, v5
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; VI-NEXT: s_and_b32 s6, s21, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc
-; VI-NEXT: v_add_f32_e32 v6, s6, v15
-; VI-NEXT: v_bfe_u32 v7, v6, 16, 1
-; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v6
-; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
-; VI-NEXT: v_or_b32_e32 v8, 0x400000, v6
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
-; VI-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; VI-NEXT: s_lshl_b32 s6, s20, 16
-; VI-NEXT: v_alignbit_b32 v6, v6, v5, 16
-; VI-NEXT: v_add_f32_e32 v5, s6, v15
-; VI-NEXT: v_bfe_u32 v7, v5, 16, 1
-; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v5
-; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
-; VI-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; VI-NEXT: s_and_b32 s6, s20, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc
-; VI-NEXT: v_add_f32_e32 v7, s6, v15
-; VI-NEXT: v_bfe_u32 v8, v7, 16, 1
-; VI-NEXT: v_add_u32_e32 v8, vcc, v8, v7
-; VI-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
-; VI-NEXT: v_or_b32_e32 v9, 0x400000, v7
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
-; VI-NEXT: v_cndmask_b32_e32 v7, v8, v9, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v7
-; VI-NEXT: s_lshl_b32 s6, s23, 16
-; VI-NEXT: v_alignbit_b32 v5, v7, v5, 16
-; VI-NEXT: v_add_f32_e32 v7, s6, v15
-; VI-NEXT: v_bfe_u32 v8, v7, 16, 1
-; VI-NEXT: v_add_u32_e32 v8, vcc, v8, v7
-; VI-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
-; VI-NEXT: v_or_b32_e32 v9, 0x400000, v7
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
-; VI-NEXT: s_and_b32 s6, s23, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v7, v8, v9, vcc
-; VI-NEXT: v_add_f32_e32 v8, s6, v15
-; VI-NEXT: v_bfe_u32 v9, v8, 16, 1
-; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8
-; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
-; VI-NEXT: v_or_b32_e32 v10, 0x400000, v8
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
-; VI-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v8
-; VI-NEXT: s_lshl_b32 s6, s22, 16
-; VI-NEXT: v_alignbit_b32 v8, v8, v7, 16
-; VI-NEXT: v_add_f32_e32 v7, s6, v15
-; VI-NEXT: v_bfe_u32 v9, v7, 16, 1
-; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v7
-; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
-; VI-NEXT: v_or_b32_e32 v10, 0x400000, v7
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
-; VI-NEXT: s_and_b32 s6, s22, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v7, v9, v10, vcc
-; VI-NEXT: v_add_f32_e32 v9, s6, v15
-; VI-NEXT: v_bfe_u32 v10, v9, 16, 1
-; VI-NEXT: v_add_u32_e32 v10, vcc, v10, v9
-; VI-NEXT: v_add_u32_e32 v10, vcc, 0x7fff, v10
-; VI-NEXT: v_or_b32_e32 v11, 0x400000, v9
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
-; VI-NEXT: v_cndmask_b32_e32 v9, v10, v11, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v9
-; VI-NEXT: s_lshl_b32 s6, s25, 16
-; VI-NEXT: v_alignbit_b32 v7, v9, v7, 16
-; VI-NEXT: v_add_f32_e32 v9, s6, v15
-; VI-NEXT: v_bfe_u32 v10, v9, 16, 1
-; VI-NEXT: v_add_u32_e32 v10, vcc, v10, v9
-; VI-NEXT: v_add_u32_e32 v10, vcc, 0x7fff, v10
-; VI-NEXT: v_or_b32_e32 v11, 0x400000, v9
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
-; VI-NEXT: s_and_b32 s6, s25, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v9, v10, v11, vcc
-; VI-NEXT: v_add_f32_e32 v10, s6, v15
-; VI-NEXT: v_bfe_u32 v11, v10, 16, 1
-; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v10
-; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11
-; VI-NEXT: v_or_b32_e32 v12, 0x400000, v10
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v10, v10
-; VI-NEXT: v_cndmask_b32_e32 v10, v11, v12, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v10
-; VI-NEXT: s_lshl_b32 s6, s24, 16
-; VI-NEXT: v_alignbit_b32 v10, v10, v9, 16
-; VI-NEXT: v_add_f32_e32 v9, s6, v15
-; VI-NEXT: v_bfe_u32 v11, v9, 16, 1
-; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v9
-; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11
-; VI-NEXT: v_or_b32_e32 v12, 0x400000, v9
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
-; VI-NEXT: s_and_b32 s6, s24, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v9, v11, v12, vcc
-; VI-NEXT: v_add_f32_e32 v11, s6, v15
-; VI-NEXT: v_bfe_u32 v12, v11, 16, 1
-; VI-NEXT: v_add_u32_e32 v12, vcc, v12, v11
-; VI-NEXT: v_add_u32_e32 v12, vcc, 0x7fff, v12
-; VI-NEXT: v_or_b32_e32 v13, 0x400000, v11
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
-; VI-NEXT: v_cndmask_b32_e32 v11, v12, v13, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v11
-; VI-NEXT: s_lshl_b32 s6, s27, 16
-; VI-NEXT: v_alignbit_b32 v9, v11, v9, 16
-; VI-NEXT: v_add_f32_e32 v11, s6, v15
-; VI-NEXT: v_bfe_u32 v12, v11, 16, 1
-; VI-NEXT: v_add_u32_e32 v12, vcc, v12, v11
-; VI-NEXT: v_add_u32_e32 v12, vcc, 0x7fff, v12
-; VI-NEXT: v_or_b32_e32 v13, 0x400000, v11
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
-; VI-NEXT: s_and_b32 s6, s27, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v11, v12, v13, vcc
-; VI-NEXT: v_add_f32_e32 v12, s6, v15
-; VI-NEXT: v_bfe_u32 v13, v12, 16, 1
-; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v12
-; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13
-; VI-NEXT: v_or_b32_e32 v14, 0x400000, v12
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v12, v12
-; VI-NEXT: v_cndmask_b32_e32 v12, v13, v14, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v12
-; VI-NEXT: s_lshl_b32 s6, s26, 16
-; VI-NEXT: v_alignbit_b32 v12, v12, v11, 16
-; VI-NEXT: v_add_f32_e32 v11, s6, v15
-; VI-NEXT: v_bfe_u32 v13, v11, 16, 1
-; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v11
-; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13
-; VI-NEXT: v_or_b32_e32 v14, 0x400000, v11
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
-; VI-NEXT: s_and_b32 s6, s26, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v11, v13, v14, vcc
-; VI-NEXT: v_add_f32_e32 v13, s6, v15
-; VI-NEXT: v_bfe_u32 v14, v13, 16, 1
-; VI-NEXT: v_add_u32_e32 v14, vcc, v14, v13
-; VI-NEXT: v_add_u32_e32 v14, vcc, 0x7fff, v14
-; VI-NEXT: v_or_b32_e32 v16, 0x400000, v13
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
-; VI-NEXT: v_cndmask_b32_e32 v13, v14, v16, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v13
-; VI-NEXT: s_lshl_b32 s6, s29, 16
-; VI-NEXT: v_alignbit_b32 v11, v13, v11, 16
-; VI-NEXT: v_add_f32_e32 v13, s6, v15
-; VI-NEXT: v_bfe_u32 v14, v13, 16, 1
-; VI-NEXT: v_add_u32_e32 v14, vcc, v14, v13
-; VI-NEXT: v_add_u32_e32 v14, vcc, 0x7fff, v14
-; VI-NEXT: v_or_b32_e32 v16, 0x400000, v13
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
-; VI-NEXT: s_and_b32 s6, s29, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v13, v14, v16, vcc
-; VI-NEXT: v_add_f32_e32 v14, s6, v15
-; VI-NEXT: v_bfe_u32 v16, v14, 16, 1
-; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v14
-; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16
-; VI-NEXT: v_or_b32_e32 v17, 0x400000, v14
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14
-; VI-NEXT: v_cndmask_b32_e32 v14, v16, v17, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v14
-; VI-NEXT: s_lshl_b32 s6, s28, 16
-; VI-NEXT: v_alignbit_b32 v14, v14, v13, 16
-; VI-NEXT: v_add_f32_e32 v13, s6, v15
-; VI-NEXT: v_bfe_u32 v16, v13, 16, 1
-; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v13
-; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16
-; VI-NEXT: v_or_b32_e32 v17, 0x400000, v13
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
-; VI-NEXT: s_and_b32 s6, s28, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v13, v16, v17, vcc
-; VI-NEXT: v_add_f32_e32 v16, s6, v15
-; VI-NEXT: v_bfe_u32 v17, v16, 16, 1
-; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16
-; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17
-; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
-; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16
-; VI-NEXT: s_lshl_b32 s6, s5, 16
-; VI-NEXT: v_alignbit_b32 v13, v16, v13, 16
-; VI-NEXT: v_add_f32_e32 v16, s6, v15
-; VI-NEXT: v_bfe_u32 v17, v16, 16, 1
-; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16
-; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17
-; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
+; VI-NEXT: s_and_b64 s[6:7], vcc, exec
+; VI-NEXT: s_cselect_b32 s6, s9, s8
+; VI-NEXT: s_and_b32 s7, s17, 0xffff0000
+; VI-NEXT: v_add_f32_e32 v2, s7, v1
+; VI-NEXT: v_readfirstlane_b32 s7, v2
+; VI-NEXT: s_bfe_u32 s8, s7, 0x10010
+; VI-NEXT: s_add_i32 s8, s8, s7
+; VI-NEXT: s_add_i32 s10, s8, 0x7fff
+; VI-NEXT: s_bitset1_b32 s7, 22
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: s_and_b64 s[8:9], vcc, exec
+; VI-NEXT: s_cselect_b32 s7, s7, s10
+; VI-NEXT: s_lshr_b32 s7, s7, 16
+; VI-NEXT: s_lshr_b64 s[6:7], s[6:7], 16
+; VI-NEXT: s_lshl_b32 s7, s16, 16
+; VI-NEXT: v_add_f32_e32 v2, s7, v1
+; VI-NEXT: v_readfirstlane_b32 s7, v2
+; VI-NEXT: s_bfe_u32 s8, s7, 0x10010
+; VI-NEXT: s_add_i32 s8, s8, s7
+; VI-NEXT: s_add_i32 s10, s8, 0x7fff
+; VI-NEXT: s_bitset1_b32 s7, 22
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: s_and_b64 s[8:9], vcc, exec
+; VI-NEXT: s_cselect_b32 s8, s7, s10
+; VI-NEXT: s_and_b32 s7, s16, 0xffff0000
+; VI-NEXT: v_add_f32_e32 v2, s7, v1
+; VI-NEXT: v_readfirstlane_b32 s7, v2
+; VI-NEXT: s_bfe_u32 s9, s7, 0x10010
+; VI-NEXT: s_add_i32 s9, s9, s7
+; VI-NEXT: s_addk_i32 s9, 0x7fff
+; VI-NEXT: s_bitset1_b32 s7, 22
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: s_and_b64 s[10:11], vcc, exec
+; VI-NEXT: s_cselect_b32 s7, s7, s9
+; VI-NEXT: s_lshr_b32 s9, s7, 16
+; VI-NEXT: s_lshl_b32 s7, s19, 16
+; VI-NEXT: v_add_f32_e32 v2, s7, v1
+; VI-NEXT: v_readfirstlane_b32 s7, v2
+; VI-NEXT: s_lshr_b64 s[16:17], s[8:9], 16
+; VI-NEXT: s_bfe_u32 s8, s7, 0x10010
+; VI-NEXT: s_add_i32 s8, s8, s7
+; VI-NEXT: s_add_i32 s10, s8, 0x7fff
+; VI-NEXT: s_bitset1_b32 s7, 22
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: s_and_b64 s[8:9], vcc, exec
+; VI-NEXT: s_cselect_b32 s8, s7, s10
+; VI-NEXT: s_and_b32 s7, s19, 0xffff0000
+; VI-NEXT: v_add_f32_e32 v2, s7, v1
+; VI-NEXT: v_readfirstlane_b32 s7, v2
+; VI-NEXT: s_bfe_u32 s9, s7, 0x10010
+; VI-NEXT: s_add_i32 s9, s9, s7
+; VI-NEXT: s_addk_i32 s9, 0x7fff
+; VI-NEXT: s_bitset1_b32 s7, 22
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: s_and_b64 s[10:11], vcc, exec
+; VI-NEXT: s_cselect_b32 s7, s7, s9
+; VI-NEXT: s_lshr_b32 s9, s7, 16
+; VI-NEXT: s_lshl_b32 s7, s18, 16
+; VI-NEXT: v_add_f32_e32 v2, s7, v1
+; VI-NEXT: s_lshr_b64 s[8:9], s[8:9], 16
+; VI-NEXT: v_readfirstlane_b32 s7, v2
+; VI-NEXT: s_bfe_u32 s9, s7, 0x10010
+; VI-NEXT: s_add_i32 s9, s9, s7
+; VI-NEXT: s_addk_i32 s9, 0x7fff
+; VI-NEXT: s_bitset1_b32 s7, 22
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: s_and_b64 s[10:11], vcc, exec
+; VI-NEXT: s_cselect_b32 s10, s7, s9
+; VI-NEXT: s_and_b32 s7, s18, 0xffff0000
+; VI-NEXT: v_add_f32_e32 v2, s7, v1
+; VI-NEXT: v_readfirstlane_b32 s7, v2
+; VI-NEXT: s_bfe_u32 s9, s7, 0x10010
+; VI-NEXT: s_add_i32 s9, s9, s7
+; VI-NEXT: s_addk_i32 s9, 0x7fff
+; VI-NEXT: s_bitset1_b32 s7, 22
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: s_and_b64 s[12:13], vcc, exec
+; VI-NEXT: s_cselect_b32 s7, s7, s9
+; VI-NEXT: s_lshr_b32 s11, s7, 16
+; VI-NEXT: s_lshl_b32 s7, s21, 16
+; VI-NEXT: v_add_f32_e32 v2, s7, v1
+; VI-NEXT: v_readfirstlane_b32 s7, v2
+; VI-NEXT: s_bfe_u32 s9, s7, 0x10010
+; VI-NEXT: s_add_i32 s9, s9, s7
+; VI-NEXT: s_lshr_b64 s[18:19], s[10:11], 16
+; VI-NEXT: s_addk_i32 s9, 0x7fff
+; VI-NEXT: s_bitset1_b32 s7, 22
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: s_and_b64 s[10:11], vcc, exec
+; VI-NEXT: s_cselect_b32 s10, s7, s9
+; VI-NEXT: s_and_b32 s7, s21, 0xffff0000
+; VI-NEXT: v_add_f32_e32 v2, s7, v1
+; VI-NEXT: v_readfirstlane_b32 s7, v2
+; VI-NEXT: s_bfe_u32 s9, s7, 0x10010
+; VI-NEXT: s_add_i32 s9, s9, s7
+; VI-NEXT: s_addk_i32 s9, 0x7fff
+; VI-NEXT: s_bitset1_b32 s7, 22
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: s_and_b64 s[12:13], vcc, exec
+; VI-NEXT: s_cselect_b32 s7, s7, s9
+; VI-NEXT: s_lshr_b32 s11, s7, 16
+; VI-NEXT: s_lshl_b32 s7, s20, 16
+; VI-NEXT: v_add_f32_e32 v2, s7, v1
+; VI-NEXT: v_readfirstlane_b32 s7, v2
+; VI-NEXT: s_bfe_u32 s9, s7, 0x10010
+; VI-NEXT: s_add_i32 s9, s9, s7
+; VI-NEXT: s_lshr_b64 s[10:11], s[10:11], 16
+; VI-NEXT: s_addk_i32 s9, 0x7fff
+; VI-NEXT: s_bitset1_b32 s7, 22
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: s_and_b64 s[12:13], vcc, exec
+; VI-NEXT: s_cselect_b32 s12, s7, s9
+; VI-NEXT: s_and_b32 s7, s20, 0xffff0000
+; VI-NEXT: v_add_f32_e32 v2, s7, v1
+; VI-NEXT: v_readfirstlane_b32 s7, v2
+; VI-NEXT: s_bfe_u32 s9, s7, 0x10010
+; VI-NEXT: s_add_i32 s9, s9, s7
+; VI-NEXT: s_addk_i32 s9, 0x7fff
+; VI-NEXT: s_bitset1_b32 s7, 22
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: s_and_b64 s[14:15], vcc, exec
+; VI-NEXT: s_cselect_b32 s7, s7, s9
+; VI-NEXT: s_lshr_b32 s13, s7, 16
+; VI-NEXT: s_lshl_b32 s7, s23, 16
+; VI-NEXT: v_add_f32_e32 v2, s7, v1
+; VI-NEXT: v_readfirstlane_b32 s7, v2
+; VI-NEXT: s_bfe_u32 s9, s7, 0x10010
+; VI-NEXT: s_add_i32 s9, s9, s7
+; VI-NEXT: s_lshr_b64 s[20:21], s[12:13], 16
+; VI-NEXT: s_addk_i32 s9, 0x7fff
+; VI-NEXT: s_bitset1_b32 s7, 22
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: s_and_b64 s[12:13], vcc, exec
+; VI-NEXT: s_cselect_b32 s12, s7, s9
+; VI-NEXT: s_and_b32 s7, s23, 0xffff0000
+; VI-NEXT: v_add_f32_e32 v2, s7, v1
+; VI-NEXT: v_readfirstlane_b32 s7, v2
+; VI-NEXT: s_bfe_u32 s9, s7, 0x10010
+; VI-NEXT: s_add_i32 s9, s9, s7
+; VI-NEXT: s_addk_i32 s9, 0x7fff
+; VI-NEXT: s_bitset1_b32 s7, 22
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: s_and_b64 s[14:15], vcc, exec
+; VI-NEXT: s_cselect_b32 s7, s7, s9
+; VI-NEXT: s_lshr_b32 s13, s7, 16
+; VI-NEXT: s_lshl_b32 s7, s22, 16
+; VI-NEXT: v_add_f32_e32 v2, s7, v1
+; VI-NEXT: v_readfirstlane_b32 s7, v2
+; VI-NEXT: s_bfe_u32 s9, s7, 0x10010
+; VI-NEXT: s_add_i32 s9, s9, s7
+; VI-NEXT: s_lshr_b64 s[12:13], s[12:13], 16
+; VI-NEXT: s_addk_i32 s9, 0x7fff
+; VI-NEXT: s_bitset1_b32 s7, 22
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: s_and_b64 s[14:15], vcc, exec
+; VI-NEXT: s_cselect_b32 s14, s7, s9
+; VI-NEXT: s_and_b32 s7, s22, 0xffff0000
+; VI-NEXT: v_add_f32_e32 v2, s7, v1
+; VI-NEXT: v_readfirstlane_b32 s7, v2
+; VI-NEXT: s_bfe_u32 s9, s7, 0x10010
+; VI-NEXT: s_add_i32 s9, s9, s7
+; VI-NEXT: s_addk_i32 s9, 0x7fff
+; VI-NEXT: s_bitset1_b32 s7, 22
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: s_and_b64 s[22:23], vcc, exec
+; VI-NEXT: s_cselect_b32 s7, s7, s9
+; VI-NEXT: s_lshr_b32 s15, s7, 16
+; VI-NEXT: s_lshl_b32 s7, s25, 16
+; VI-NEXT: v_add_f32_e32 v2, s7, v1
+; VI-NEXT: v_readfirstlane_b32 s7, v2
+; VI-NEXT: s_bfe_u32 s9, s7, 0x10010
+; VI-NEXT: s_add_i32 s9, s9, s7
+; VI-NEXT: s_lshr_b64 s[22:23], s[14:15], 16
+; VI-NEXT: s_addk_i32 s9, 0x7fff
+; VI-NEXT: s_bitset1_b32 s7, 22
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: s_and_b64 s[14:15], vcc, exec
+; VI-NEXT: s_cselect_b32 s14, s7, s9
+; VI-NEXT: s_and_b32 s7, s25, 0xffff0000
+; VI-NEXT: v_add_f32_e32 v2, s7, v1
+; VI-NEXT: v_readfirstlane_b32 s7, v2
+; VI-NEXT: s_bfe_u32 s9, s7, 0x10010
+; VI-NEXT: s_add_i32 s9, s9, s7
+; VI-NEXT: s_addk_i32 s9, 0x7fff
+; VI-NEXT: s_bitset1_b32 s7, 22
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: s_and_b64 s[40:41], vcc, exec
+; VI-NEXT: s_cselect_b32 s7, s7, s9
+; VI-NEXT: s_lshr_b32 s15, s7, 16
+; VI-NEXT: s_lshl_b32 s7, s24, 16
+; VI-NEXT: v_add_f32_e32 v2, s7, v1
+; VI-NEXT: v_readfirstlane_b32 s7, v2
+; VI-NEXT: s_bfe_u32 s9, s7, 0x10010
+; VI-NEXT: s_add_i32 s9, s9, s7
+; VI-NEXT: s_lshr_b64 s[14:15], s[14:15], 16
+; VI-NEXT: s_addk_i32 s9, 0x7fff
+; VI-NEXT: s_bitset1_b32 s7, 22
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: s_and_b64 s[40:41], vcc, exec
+; VI-NEXT: s_cselect_b32 s40, s7, s9
+; VI-NEXT: s_and_b32 s7, s24, 0xffff0000
+; VI-NEXT: v_add_f32_e32 v2, s7, v1
+; VI-NEXT: v_readfirstlane_b32 s7, v2
+; VI-NEXT: s_bfe_u32 s9, s7, 0x10010
+; VI-NEXT: s_add_i32 s9, s9, s7
+; VI-NEXT: s_addk_i32 s9, 0x7fff
+; VI-NEXT: s_bitset1_b32 s7, 22
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: s_and_b64 s[24:25], vcc, exec
+; VI-NEXT: s_cselect_b32 s7, s7, s9
+; VI-NEXT: s_lshr_b32 s41, s7, 16
+; VI-NEXT: s_lshl_b32 s7, s27, 16
+; VI-NEXT: v_add_f32_e32 v2, s7, v1
+; VI-NEXT: v_readfirstlane_b32 s7, v2
+; VI-NEXT: s_bfe_u32 s9, s7, 0x10010
+; VI-NEXT: s_add_i32 s9, s9, s7
+; VI-NEXT: s_lshr_b64 s[24:25], s[40:41], 16
+; VI-NEXT: s_addk_i32 s9, 0x7fff
+; VI-NEXT: s_bitset1_b32 s7, 22
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: s_and_b64 s[40:41], vcc, exec
+; VI-NEXT: s_cselect_b32 s40, s7, s9
+; VI-NEXT: s_and_b32 s7, s27, 0xffff0000
+; VI-NEXT: v_add_f32_e32 v2, s7, v1
+; VI-NEXT: v_readfirstlane_b32 s7, v2
+; VI-NEXT: s_bfe_u32 s9, s7, 0x10010
+; VI-NEXT: s_add_i32 s9, s9, s7
+; VI-NEXT: s_addk_i32 s9, 0x7fff
+; VI-NEXT: s_bitset1_b32 s7, 22
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: s_and_b64 s[42:43], vcc, exec
+; VI-NEXT: s_cselect_b32 s7, s7, s9
+; VI-NEXT: s_lshr_b32 s41, s7, 16
+; VI-NEXT: s_lshl_b32 s7, s26, 16
+; VI-NEXT: v_add_f32_e32 v2, s7, v1
+; VI-NEXT: v_readfirstlane_b32 s7, v2
+; VI-NEXT: s_bfe_u32 s9, s7, 0x10010
+; VI-NEXT: s_add_i32 s9, s9, s7
+; VI-NEXT: s_lshr_b64 s[40:41], s[40:41], 16
+; VI-NEXT: s_addk_i32 s9, 0x7fff
+; VI-NEXT: s_bitset1_b32 s7, 22
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: s_and_b64 s[42:43], vcc, exec
+; VI-NEXT: s_cselect_b32 s42, s7, s9
+; VI-NEXT: s_and_b32 s7, s26, 0xffff0000
+; VI-NEXT: v_add_f32_e32 v2, s7, v1
+; VI-NEXT: v_readfirstlane_b32 s7, v2
+; VI-NEXT: s_bfe_u32 s9, s7, 0x10010
+; VI-NEXT: s_add_i32 s9, s9, s7
+; VI-NEXT: s_addk_i32 s9, 0x7fff
+; VI-NEXT: s_bitset1_b32 s7, 22
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: s_and_b64 s[26:27], vcc, exec
+; VI-NEXT: s_cselect_b32 s7, s7, s9
+; VI-NEXT: s_lshr_b32 s43, s7, 16
+; VI-NEXT: s_lshl_b32 s7, s29, 16
+; VI-NEXT: v_add_f32_e32 v2, s7, v1
+; VI-NEXT: v_readfirstlane_b32 s7, v2
+; VI-NEXT: s_bfe_u32 s9, s7, 0x10010
+; VI-NEXT: s_add_i32 s9, s9, s7
+; VI-NEXT: s_lshr_b64 s[26:27], s[42:43], 16
+; VI-NEXT: s_addk_i32 s9, 0x7fff
+; VI-NEXT: s_bitset1_b32 s7, 22
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: s_and_b64 s[42:43], vcc, exec
+; VI-NEXT: s_cselect_b32 s42, s7, s9
+; VI-NEXT: s_and_b32 s7, s29, 0xffff0000
+; VI-NEXT: v_add_f32_e32 v2, s7, v1
+; VI-NEXT: v_readfirstlane_b32 s7, v2
+; VI-NEXT: s_bfe_u32 s9, s7, 0x10010
+; VI-NEXT: s_add_i32 s9, s9, s7
+; VI-NEXT: s_addk_i32 s9, 0x7fff
+; VI-NEXT: s_bitset1_b32 s7, 22
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: s_and_b64 s[44:45], vcc, exec
+; VI-NEXT: s_cselect_b32 s7, s7, s9
+; VI-NEXT: s_lshr_b32 s43, s7, 16
+; VI-NEXT: s_lshl_b32 s7, s28, 16
+; VI-NEXT: v_add_f32_e32 v2, s7, v1
+; VI-NEXT: v_readfirstlane_b32 s7, v2
+; VI-NEXT: s_bfe_u32 s9, s7, 0x10010
+; VI-NEXT: s_add_i32 s9, s9, s7
+; VI-NEXT: s_lshr_b64 s[46:47], s[42:43], 16
+; VI-NEXT: s_addk_i32 s9, 0x7fff
+; VI-NEXT: s_bitset1_b32 s7, 22
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: s_and_b64 s[42:43], vcc, exec
+; VI-NEXT: s_cselect_b32 s42, s7, s9
+; VI-NEXT: s_and_b32 s7, s28, 0xffff0000
+; VI-NEXT: v_add_f32_e32 v2, s7, v1
+; VI-NEXT: v_readfirstlane_b32 s7, v2
+; VI-NEXT: s_bfe_u32 s9, s7, 0x10010
+; VI-NEXT: s_add_i32 s9, s9, s7
+; VI-NEXT: s_addk_i32 s9, 0x7fff
+; VI-NEXT: s_bitset1_b32 s7, 22
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: s_and_b64 s[28:29], vcc, exec
+; VI-NEXT: s_cselect_b32 s7, s7, s9
+; VI-NEXT: s_lshr_b32 s43, s7, 16
+; VI-NEXT: s_lshl_b32 s7, s5, 16
+; VI-NEXT: v_add_f32_e32 v2, s7, v1
+; VI-NEXT: v_readfirstlane_b32 s7, v2
+; VI-NEXT: s_bfe_u32 s9, s7, 0x10010
+; VI-NEXT: s_add_i32 s9, s9, s7
+; VI-NEXT: s_lshr_b64 s[28:29], s[42:43], 16
+; VI-NEXT: s_addk_i32 s9, 0x7fff
+; VI-NEXT: s_bitset1_b32 s7, 22
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: s_and_b64 s[42:43], vcc, exec
+; VI-NEXT: s_cselect_b32 s42, s7, s9
 ; VI-NEXT: s_and_b32 s5, s5, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc
-; VI-NEXT: v_add_f32_e32 v17, s5, v15
-; VI-NEXT: v_bfe_u32 v18, v17, 16, 1
-; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v17
-; VI-NEXT: v_add_u32_e32 v18, vcc, 0x7fff, v18
-; VI-NEXT: v_or_b32_e32 v19, 0x400000, v17
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
-; VI-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; VI-NEXT: v_add_f32_e32 v2, s5, v1
+; VI-NEXT: v_readfirstlane_b32 s5, v2
+; VI-NEXT: s_bfe_u32 s7, s5, 0x10010
+; VI-NEXT: s_add_i32 s7, s7, s5
+; VI-NEXT: s_addk_i32 s7, 0x7fff
+; VI-NEXT: s_bitset1_b32 s5, 22
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: s_and_b64 s[44:45], vcc, exec
+; VI-NEXT: s_cselect_b32 s5, s5, s7
+; VI-NEXT: s_lshr_b32 s43, s5, 16
 ; VI-NEXT: s_lshl_b32 s5, s4, 16
-; VI-NEXT: v_alignbit_b32 v16, v17, v16, 16
-; VI-NEXT: v_add_f32_e32 v17, s5, v15
-; VI-NEXT: v_bfe_u32 v18, v17, 16, 1
-; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v17
-; VI-NEXT: v_add_u32_e32 v18, vcc, 0x7fff, v18
+; VI-NEXT: v_add_f32_e32 v2, s5, v1
+; VI-NEXT: v_readfirstlane_b32 s5, v2
+; VI-NEXT: s_bfe_u32 s7, s5, 0x10010
+; VI-NEXT: s_add_i32 s7, s7, s5
+; VI-NEXT: s_lshr_b64 s[56:57], s[42:43], 16
+; VI-NEXT: s_addk_i32 s7, 0x7fff
+; VI-NEXT: s_bitset1_b32 s5, 22
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: s_and_b64 s[42:43], vcc, exec
+; VI-NEXT: s_cselect_b32 s42, s5, s7
 ; VI-NEXT: s_and_b32 s4, s4, 0xffff0000
-; VI-NEXT: v_or_b32_e32 v19, 0x400000, v17
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
-; VI-NEXT: v_add_f32_e32 v15, s4, v15
-; VI-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc
-; VI-NEXT: v_bfe_u32 v18, v15, 16, 1
-; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v15
-; VI-NEXT: v_add_u32_e32 v18, vcc, 0x7fff, v18
-; VI-NEXT: v_or_b32_e32 v19, 0x400000, v15
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15
-; VI-NEXT: v_cndmask_b32_e32 v15, v18, v19, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v15
-; VI-NEXT: v_alignbit_b32 v15, v15, v17, 16
-; VI-NEXT: v_lshrrev_b64 v[17:18], 24, v[15:16]
-; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[11:12]
-; VI-NEXT: v_lshrrev_b64 v[20:21], 24, v[9:10]
-; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[21:22], 24, v[7:8]
-; VI-NEXT: v_lshrrev_b64 v[17:18], 24, v[13:14]
-; VI-NEXT: v_lshrrev_b64 v[22:23], 24, v[5:6]
-; VI-NEXT: v_lshrrev_b64 v[23:24], 24, v[3:4]
-; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[24:25], 24, v[1:2]
-; VI-NEXT: v_lshrrev_b32_e32 v26, 24, v16
-; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v16
-; VI-NEXT: v_lshrrev_b32_e32 v27, 8, v16
-; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v15
-; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v15
-; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v14
-; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v14
-; VI-NEXT: v_lshrrev_b32_e32 v32, 8, v14
-; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v13
-; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v13
-; VI-NEXT: v_lshrrev_b32_e32 v36, 24, v12
-; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v12
-; VI-NEXT: v_lshrrev_b32_e32 v37, 8, v12
-; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v11
-; VI-NEXT: v_lshrrev_b32_e32 v39, 8, v11
-; VI-NEXT: v_lshrrev_b32_e32 v49, 24, v10
-; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v10
-; VI-NEXT: v_lshrrev_b32_e32 v50, 8, v10
-; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v9
-; VI-NEXT: v_lshrrev_b32_e32 v52, 8, v9
-; VI-NEXT: v_lshrrev_b32_e32 v54, 24, v8
-; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v8
-; VI-NEXT: v_lshrrev_b32_e32 v55, 8, v8
-; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v7
-; VI-NEXT: v_lshrrev_b32_e32 v41, 8, v7
-; VI-NEXT: v_lshrrev_b32_e32 v43, 24, v6
-; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v6
-; VI-NEXT: v_lshrrev_b32_e32 v44, 8, v6
-; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v5
-; VI-NEXT: v_lshrrev_b32_e32 v46, 8, v5
-; VI-NEXT: v_lshrrev_b32_e32 v56, 24, v4
-; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v4
-; VI-NEXT: v_lshrrev_b32_e32 v57, 8, v4
-; VI-NEXT: v_lshrrev_b32_e32 v59, 16, v3
-; VI-NEXT: v_lshrrev_b32_e32 v58, 8, v3
-; VI-NEXT: v_lshrrev_b32_e32 v61, 24, v2
-; VI-NEXT: v_lshrrev_b32_e32 v60, 16, v2
-; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v2
-; VI-NEXT: v_lshrrev_b32_e32 v62, 16, v1
-; VI-NEXT: v_lshrrev_b32_e32 v18, 8, v1
-; VI-NEXT: s_branch .LBB109_5
-; VI-NEXT: .LBB109_3:
-; VI-NEXT: ; implicit-def: $sgpr67
-; VI-NEXT: ; implicit-def: $sgpr65
+; VI-NEXT: v_add_f32_e32 v1, s4, v1
+; VI-NEXT: v_readfirstlane_b32 s4, v1
+; VI-NEXT: s_bfe_u32 s5, s4, 0x10010
+; VI-NEXT: s_add_i32 s5, s5, s4
+; VI-NEXT: s_add_i32 s7, s5, 0x7fff
+; VI-NEXT: s_or_b32 s9, s4, 0x400000
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; VI-NEXT: s_and_b64 s[4:5], vcc, exec
+; VI-NEXT: s_cselect_b32 s4, s9, s7
+; VI-NEXT: s_lshr_b32 s43, s4, 16
+; VI-NEXT: s_lshr_b64 s[4:5], s[42:43], 16
+; VI-NEXT: s_mov_b32 s17, s6
+; VI-NEXT: s_mov_b32 s19, s8
+; VI-NEXT: s_mov_b32 s21, s10
+; VI-NEXT: s_mov_b32 s23, s12
+; VI-NEXT: s_mov_b32 s25, s14
+; VI-NEXT: s_mov_b32 s27, s40
+; VI-NEXT: s_mov_b32 s29, s46
+; VI-NEXT: s_mov_b32 s5, s56
+; VI-NEXT: s_lshr_b64 s[42:43], s[4:5], 24
+; VI-NEXT: s_lshr_b64 s[44:45], s[28:29], 24
+; VI-NEXT: s_lshr_b32 s7, s56, 24
+; VI-NEXT: s_lshr_b32 s9, s56, 16
+; VI-NEXT: s_lshr_b32 s11, s56, 8
+; VI-NEXT: s_lshr_b32 s13, s4, 16
+; VI-NEXT: s_lshr_b32 s15, s4, 8
+; VI-NEXT: s_lshr_b32 s41, s46, 24
+; VI-NEXT: s_lshr_b32 s47, s46, 16
+; VI-NEXT: s_lshr_b32 s57, s46, 8
+; VI-NEXT: s_lshr_b32 s88, s28, 16
+; VI-NEXT: s_lshr_b32 s89, s28, 8
+; VI-NEXT: s_lshr_b32 s90, s40, 24
+; VI-NEXT: s_lshr_b32 s91, s40, 16
+; VI-NEXT: s_lshr_b32 s30, s40, 8
+; VI-NEXT: s_lshr_b32 s31, s26, 16
+; VI-NEXT: s_lshr_b32 s34, s26, 8
+; VI-NEXT: s_lshr_b32 s35, s14, 24
+; VI-NEXT: s_lshr_b32 s36, s14, 16
+; VI-NEXT: s_lshr_b32 s37, s14, 8
+; VI-NEXT: s_lshr_b32 s38, s24, 16
+; VI-NEXT: s_lshr_b32 s39, s24, 8
+; VI-NEXT: s_lshr_b32 s48, s12, 24
+; VI-NEXT: s_lshr_b32 s49, s12, 16
+; VI-NEXT: s_lshr_b32 s50, s12, 8
+; VI-NEXT: s_lshr_b32 s51, s22, 16
+; VI-NEXT: s_lshr_b32 s52, s22, 8
+; VI-NEXT: s_lshr_b32 s53, s10, 24
+; VI-NEXT: s_lshr_b32 s54, s10, 16
+; VI-NEXT: s_lshr_b32 s55, s10, 8
+; VI-NEXT: s_lshr_b32 s64, s20, 16
+; VI-NEXT: s_lshr_b32 s65, s20, 8
+; VI-NEXT: s_lshr_b32 s66, s8, 24
+; VI-NEXT: s_lshr_b32 s67, s8, 16
+; VI-NEXT: s_lshr_b32 s68, s8, 8
+; VI-NEXT: s_lshr_b32 s69, s18, 16
+; VI-NEXT: s_lshr_b32 s70, s18, 8
+; VI-NEXT: s_lshr_b32 s71, s6, 24
+; VI-NEXT: s_lshr_b32 s80, s6, 16
+; VI-NEXT: s_lshr_b32 s81, s6, 8
+; VI-NEXT: s_lshr_b32 s82, s16, 16
+; VI-NEXT: s_lshr_b32 s83, s16, 8
+; VI-NEXT: s_lshr_b64 s[58:59], s[26:27], 24
+; VI-NEXT: s_lshr_b64 s[60:61], s[24:25], 24
+; VI-NEXT: s_lshr_b64 s[62:63], s[22:23], 24
+; VI-NEXT: s_lshr_b64 s[72:73], s[20:21], 24
+; VI-NEXT: s_lshr_b64 s[74:75], s[18:19], 24
+; VI-NEXT: s_lshr_b64 s[76:77], s[16:17], 24
+; VI-NEXT: .LBB109_3: ; %end
+; VI-NEXT: s_and_b32 s5, s16, 0xff
+; VI-NEXT: s_lshl_b32 s16, s83, 8
+; VI-NEXT: s_or_b32 s5, s5, s16
+; VI-NEXT: s_lshl_b32 s16, s76, 8
+; VI-NEXT: s_and_b32 s17, s82, 0xff
+; VI-NEXT: s_or_b32 s16, s17, s16
+; VI-NEXT: s_and_b32 s5, s5, 0xffff
+; VI-NEXT: s_lshl_b32 s16, s16, 16
+; VI-NEXT: s_or_b32 s5, s5, s16
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: s_and_b32 s5, s6, 0xff
+; VI-NEXT: s_lshl_b32 s6, s81, 8
+; VI-NEXT: s_or_b32 s5, s5, s6
+; VI-NEXT: s_and_b32 s6, s80, 0xff
+; VI-NEXT: s_lshl_b32 s16, s71, 8
+; VI-NEXT: s_or_b32 s6, s6, s16
+; VI-NEXT: s_and_b32 s5, s5, 0xffff
+; VI-NEXT: s_lshl_b32 s6, s6, 16
+; VI-NEXT: s_or_b32 s5, s5, s6
+; VI-NEXT: v_mov_b32_e32 v2, s5
+; VI-NEXT: s_and_b32 s5, s18, 0xff
+; VI-NEXT: s_lshl_b32 s6, s70, 8
+; VI-NEXT: s_or_b32 s5, s5, s6
+; VI-NEXT: s_and_b32 s6, s69, 0xff
+; VI-NEXT: s_lshl_b32 s16, s74, 8
+; VI-NEXT: s_or_b32 s6, s6, s16
+; VI-NEXT: s_and_b32 s5, s5, 0xffff
+; VI-NEXT: s_lshl_b32 s6, s6, 16
+; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
+; VI-NEXT: v_add_u32_e32 v1, vcc, 4, v0
+; VI-NEXT: s_or_b32 s5, s5, s6
+; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
+; VI-NEXT: v_mov_b32_e32 v2, s5
+; VI-NEXT: s_and_b32 s5, s8, 0xff
+; VI-NEXT: s_lshl_b32 s6, s68, 8
+; VI-NEXT: s_or_b32 s5, s5, s6
+; VI-NEXT: s_and_b32 s6, s67, 0xff
+; VI-NEXT: s_lshl_b32 s8, s66, 8
+; VI-NEXT: s_or_b32 s6, s6, s8
+; VI-NEXT: s_and_b32 s5, s5, 0xffff
+; VI-NEXT: s_lshl_b32 s6, s6, 16
+; VI-NEXT: v_add_u32_e32 v1, vcc, 8, v0
+; VI-NEXT: s_or_b32 s5, s5, s6
+; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
+; VI-NEXT: v_mov_b32_e32 v2, s5
+; VI-NEXT: s_and_b32 s5, s20, 0xff
+; VI-NEXT: s_lshl_b32 s6, s65, 8
+; VI-NEXT: s_or_b32 s5, s5, s6
+; VI-NEXT: s_and_b32 s6, s64, 0xff
+; VI-NEXT: s_lshl_b32 s8, s72, 8
+; VI-NEXT: s_or_b32 s6, s6, s8
+; VI-NEXT: s_and_b32 s5, s5, 0xffff
+; VI-NEXT: s_lshl_b32 s6, s6, 16
+; VI-NEXT: v_add_u32_e32 v1, vcc, 12, v0
+; VI-NEXT: s_or_b32 s5, s5, s6
+; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
+; VI-NEXT: v_mov_b32_e32 v2, s5
+; VI-NEXT: s_and_b32 s5, s10, 0xff
+; VI-NEXT: s_lshl_b32 s6, s55, 8
+; VI-NEXT: s_or_b32 s5, s5, s6
+; VI-NEXT: s_and_b32 s6, s54, 0xff
+; VI-NEXT: s_lshl_b32 s8, s53, 8
+; VI-NEXT: s_or_b32 s6, s6, s8
+; VI-NEXT: s_and_b32 s5, s5, 0xffff
+; VI-NEXT: s_lshl_b32 s6, s6, 16
+; VI-NEXT: v_add_u32_e32 v1, vcc, 16, v0
+; VI-NEXT: s_or_b32 s5, s5, s6
+; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
+; VI-NEXT: v_mov_b32_e32 v2, s5
+; VI-NEXT: s_and_b32 s5, s22, 0xff
+; VI-NEXT: s_lshl_b32 s6, s52, 8
+; VI-NEXT: s_or_b32 s5, s5, s6
+; VI-NEXT: s_and_b32 s6, s51, 0xff
+; VI-NEXT: s_lshl_b32 s8, s62, 8
+; VI-NEXT: s_or_b32 s6, s6, s8
+; VI-NEXT: s_and_b32 s5, s5, 0xffff
+; VI-NEXT: s_lshl_b32 s6, s6, 16
+; VI-NEXT: v_add_u32_e32 v1, vcc, 20, v0
+; VI-NEXT: s_or_b32 s5, s5, s6
+; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
+; VI-NEXT: v_mov_b32_e32 v2, s5
+; VI-NEXT: s_and_b32 s5, s12, 0xff
+; VI-NEXT: s_lshl_b32 s6, s50, 8
+; VI-NEXT: s_or_b32 s5, s5, s6
+; VI-NEXT: s_and_b32 s6, s49, 0xff
+; VI-NEXT: s_lshl_b32 s8, s48, 8
+; VI-NEXT: s_or_b32 s6, s6, s8
+; VI-NEXT: s_and_b32 s5, s5, 0xffff
+; VI-NEXT: s_lshl_b32 s6, s6, 16
+; VI-NEXT: v_add_u32_e32 v1, vcc, 24, v0
+; VI-NEXT: s_or_b32 s5, s5, s6
+; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
+; VI-NEXT: v_mov_b32_e32 v2, s5
+; VI-NEXT: s_and_b32 s5, s24, 0xff
+; VI-NEXT: s_lshl_b32 s6, s39, 8
+; VI-NEXT: s_or_b32 s5, s5, s6
+; VI-NEXT: s_and_b32 s6, s38, 0xff
+; VI-NEXT: s_lshl_b32 s8, s60, 8
+; VI-NEXT: s_or_b32 s6, s6, s8
+; VI-NEXT: s_and_b32 s5, s5, 0xffff
+; VI-NEXT: s_lshl_b32 s6, s6, 16
+; VI-NEXT: v_add_u32_e32 v1, vcc, 28, v0
+; VI-NEXT: s_or_b32 s5, s5, s6
+; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
+; VI-NEXT: v_mov_b32_e32 v2, s5
+; VI-NEXT: s_and_b32 s5, s14, 0xff
+; VI-NEXT: s_lshl_b32 s6, s37, 8
+; VI-NEXT: s_or_b32 s5, s5, s6
+; VI-NEXT: s_and_b32 s6, s36, 0xff
+; VI-NEXT: s_lshl_b32 s8, s35, 8
+; VI-NEXT: s_or_b32 s6, s6, s8
+; VI-NEXT: s_and_b32 s5, s5, 0xffff
+; VI-NEXT: s_lshl_b32 s6, s6, 16
+; VI-NEXT: v_add_u32_e32 v1, vcc, 32, v0
+; VI-NEXT: s_or_b32 s5, s5, s6
+; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
+; VI-NEXT: v_mov_b32_e32 v2, s5
+; VI-NEXT: s_and_b32 s5, s26, 0xff
+; VI-NEXT: s_lshl_b32 s6, s34, 8
+; VI-NEXT: s_or_b32 s5, s5, s6
+; VI-NEXT: s_and_b32 s6, s31, 0xff
+; VI-NEXT: s_lshl_b32 s8, s58, 8
+; VI-NEXT: s_or_b32 s6, s6, s8
+; VI-NEXT: s_and_b32 s5, s5, 0xffff
+; VI-NEXT: s_lshl_b32 s6, s6, 16
+; VI-NEXT: v_add_u32_e32 v1, vcc, 36, v0
+; VI-NEXT: s_or_b32 s5, s5, s6
+; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
+; VI-NEXT: v_mov_b32_e32 v2, s5
+; VI-NEXT: s_and_b32 s5, s40, 0xff
+; VI-NEXT: s_lshl_b32 s6, s30, 8
+; VI-NEXT: s_or_b32 s5, s5, s6
+; VI-NEXT: s_and_b32 s6, s91, 0xff
+; VI-NEXT: s_lshl_b32 s8, s90, 8
+; VI-NEXT: s_or_b32 s6, s6, s8
+; VI-NEXT: s_and_b32 s5, s5, 0xffff
+; VI-NEXT: s_lshl_b32 s6, s6, 16
+; VI-NEXT: v_add_u32_e32 v1, vcc, 40, v0
+; VI-NEXT: s_or_b32 s5, s5, s6
+; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
+; VI-NEXT: v_mov_b32_e32 v2, s5
+; VI-NEXT: s_and_b32 s5, s28, 0xff
+; VI-NEXT: s_lshl_b32 s6, s89, 8
+; VI-NEXT: s_or_b32 s5, s5, s6
+; VI-NEXT: s_and_b32 s6, s88, 0xff
+; VI-NEXT: s_lshl_b32 s8, s44, 8
+; VI-NEXT: s_or_b32 s6, s6, s8
+; VI-NEXT: s_and_b32 s5, s5, 0xffff
+; VI-NEXT: s_lshl_b32 s6, s6, 16
+; VI-NEXT: v_add_u32_e32 v1, vcc, 44, v0
+; VI-NEXT: s_or_b32 s5, s5, s6
+; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
+; VI-NEXT: v_mov_b32_e32 v2, s5
+; VI-NEXT: s_and_b32 s5, s46, 0xff
+; VI-NEXT: s_lshl_b32 s6, s57, 8
+; VI-NEXT: s_or_b32 s5, s5, s6
+; VI-NEXT: s_and_b32 s6, s47, 0xff
+; VI-NEXT: s_lshl_b32 s8, s41, 8
+; VI-NEXT: s_or_b32 s6, s6, s8
+; VI-NEXT: s_and_b32 s5, s5, 0xffff
+; VI-NEXT: s_lshl_b32 s6, s6, 16
+; VI-NEXT: v_add_u32_e32 v1, vcc, 48, v0
+; VI-NEXT: s_or_b32 s5, s5, s6
+; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
+; VI-NEXT: v_mov_b32_e32 v2, s5
+; VI-NEXT: s_and_b32 s4, s4, 0xff
+; VI-NEXT: s_lshl_b32 s5, s15, 8
+; VI-NEXT: s_or_b32 s4, s4, s5
+; VI-NEXT: s_and_b32 s5, s13, 0xff
+; VI-NEXT: s_lshl_b32 s6, s42, 8
+; VI-NEXT: s_or_b32 s5, s5, s6
+; VI-NEXT: s_and_b32 s4, s4, 0xffff
+; VI-NEXT: s_lshl_b32 s5, s5, 16
+; VI-NEXT: v_add_u32_e32 v1, vcc, 52, v0
+; VI-NEXT: s_or_b32 s4, s4, s5
+; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
+; VI-NEXT: v_mov_b32_e32 v2, s4
+; VI-NEXT: s_and_b32 s4, s56, 0xff
+; VI-NEXT: s_lshl_b32 s5, s11, 8
+; VI-NEXT: s_or_b32 s4, s4, s5
+; VI-NEXT: s_and_b32 s5, s9, 0xff
+; VI-NEXT: s_lshl_b32 s6, s7, 8
+; VI-NEXT: s_or_b32 s5, s5, s6
+; VI-NEXT: s_and_b32 s4, s4, 0xffff
+; VI-NEXT: s_lshl_b32 s5, s5, 16
+; VI-NEXT: v_add_u32_e32 v1, vcc, 56, v0
+; VI-NEXT: s_or_b32 s4, s4, s5
+; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
+; VI-NEXT: v_add_u32_e32 v0, vcc, 60, v0
+; VI-NEXT: v_mov_b32_e32 v1, s4
+; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
+; VI-NEXT: v_readlane_b32 s83, v4, 27
+; VI-NEXT: v_readlane_b32 s82, v4, 26
+; VI-NEXT: v_readlane_b32 s81, v4, 25
+; VI-NEXT: v_readlane_b32 s80, v4, 24
+; VI-NEXT: v_readlane_b32 s71, v4, 23
+; VI-NEXT: v_readlane_b32 s70, v4, 22
+; VI-NEXT: v_readlane_b32 s69, v4, 21
+; VI-NEXT: v_readlane_b32 s68, v4, 20
+; VI-NEXT: v_readlane_b32 s67, v4, 19
+; VI-NEXT: v_readlane_b32 s66, v4, 18
+; VI-NEXT: v_readlane_b32 s65, v4, 17
+; VI-NEXT: v_readlane_b32 s64, v4, 16
+; VI-NEXT: v_readlane_b32 s55, v4, 15
+; VI-NEXT: v_readlane_b32 s54, v4, 14
+; VI-NEXT: v_readlane_b32 s53, v4, 13
+; VI-NEXT: v_readlane_b32 s52, v4, 12
+; VI-NEXT: v_readlane_b32 s51, v4, 11
+; VI-NEXT: v_readlane_b32 s50, v4, 10
+; VI-NEXT: v_readlane_b32 s49, v4, 9
+; VI-NEXT: v_readlane_b32 s48, v4, 8
+; VI-NEXT: v_readlane_b32 s39, v4, 7
+; VI-NEXT: v_readlane_b32 s38, v4, 6
+; VI-NEXT: v_readlane_b32 s37, v4, 5
+; VI-NEXT: v_readlane_b32 s36, v4, 4
+; VI-NEXT: v_readlane_b32 s35, v4, 3
+; VI-NEXT: v_readlane_b32 s34, v4, 2
+; VI-NEXT: v_readlane_b32 s31, v4, 1
+; VI-NEXT: v_readlane_b32 s30, v4, 0
+; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
+; VI-NEXT: s_mov_b64 exec, s[4:5]
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_setpc_b64 s[30:31]
+; VI-NEXT: .LBB109_4:
+; VI-NEXT: ; implicit-def: $sgpr83
+; VI-NEXT: ; implicit-def: $sgpr82
+; VI-NEXT: ; implicit-def: $sgpr76
 ; VI-NEXT: ; implicit-def: $sgpr6
+; VI-NEXT: ; implicit-def: $sgpr81
+; VI-NEXT: ; implicit-def: $sgpr80
+; VI-NEXT: ; implicit-def: $sgpr71
+; VI-NEXT: ; implicit-def: $sgpr70
+; VI-NEXT: ; implicit-def: $sgpr69
+; VI-NEXT: ; implicit-def: $sgpr74
+; VI-NEXT: ; implicit-def: $sgpr8
+; VI-NEXT: ; implicit-def: $sgpr68
+; VI-NEXT: ; implicit-def: $sgpr67
 ; VI-NEXT: ; implicit-def: $sgpr66
+; VI-NEXT: ; implicit-def: $sgpr65
 ; VI-NEXT: ; implicit-def: $sgpr64
+; VI-NEXT: ; implicit-def: $sgpr72
+; VI-NEXT: ; implicit-def: $sgpr10
 ; VI-NEXT: ; implicit-def: $sgpr55
 ; VI-NEXT: ; implicit-def: $sgpr54
-; VI-NEXT: ; implicit-def: $sgpr52
-; VI-NEXT: ; implicit-def: $sgpr8
 ; VI-NEXT: ; implicit-def: $sgpr53
+; VI-NEXT: ; implicit-def: $sgpr52
 ; VI-NEXT: ; implicit-def: $sgpr51
+; VI-NEXT: ; implicit-def: $sgpr62
+; VI-NEXT: ; implicit-def: $sgpr12
 ; VI-NEXT: ; implicit-def: $sgpr50
 ; VI-NEXT: ; implicit-def: $sgpr49
-; VI-NEXT: ; implicit-def: $sgpr39
-; VI-NEXT: ; implicit-def: $sgpr10
 ; VI-NEXT: ; implicit-def: $sgpr48
+; VI-NEXT: ; implicit-def: $sgpr39
 ; VI-NEXT: ; implicit-def: $sgpr38
+; VI-NEXT: ; implicit-def: $sgpr60
+; VI-NEXT: ; implicit-def: $sgpr14
 ; VI-NEXT: ; implicit-def: $sgpr37
 ; VI-NEXT: ; implicit-def: $sgpr36
-; VI-NEXT: ; implicit-def: $sgpr34
-; VI-NEXT: ; implicit-def: $sgpr12
 ; VI-NEXT: ; implicit-def: $sgpr35
+; VI-NEXT: ; implicit-def: $sgpr34
 ; VI-NEXT: ; implicit-def: $sgpr31
+; VI-NEXT: ; implicit-def: $sgpr58
+; VI-NEXT: ; implicit-def: $sgpr40
 ; VI-NEXT: ; implicit-def: $sgpr30
 ; VI-NEXT: ; implicit-def: $sgpr91
-; VI-NEXT: ; implicit-def: $sgpr89
-; VI-NEXT: ; implicit-def: $sgpr14
 ; VI-NEXT: ; implicit-def: $sgpr90
+; VI-NEXT: ; implicit-def: $sgpr89
 ; VI-NEXT: ; implicit-def: $sgpr88
-; VI-NEXT: ; implicit-def: $sgpr79
-; VI-NEXT: ; implicit-def: $sgpr78
-; VI-NEXT: ; implicit-def: $sgpr76
-; VI-NEXT: ; implicit-def: $sgpr40
-; VI-NEXT: ; implicit-def: $sgpr77
-; VI-NEXT: ; implicit-def: $sgpr75
-; VI-NEXT: ; implicit-def: $sgpr74
-; VI-NEXT: ; implicit-def: $sgpr73
-; VI-NEXT: ; implicit-def: $sgpr63
-; VI-NEXT: ; implicit-def: $sgpr42
-; VI-NEXT: ; implicit-def: $sgpr72
-; VI-NEXT: ; implicit-def: $sgpr62
-; VI-NEXT: ; implicit-def: $sgpr61
-; VI-NEXT: ; implicit-def: $sgpr60
-; VI-NEXT: ; implicit-def: $sgpr58
 ; VI-NEXT: ; implicit-def: $sgpr44
-; VI-NEXT: ; implicit-def: $sgpr59
+; VI-NEXT: ; implicit-def: $sgpr46
 ; VI-NEXT: ; implicit-def: $sgpr57
+; VI-NEXT: ; implicit-def: $sgpr47
+; VI-NEXT: ; implicit-def: $sgpr41
+; VI-NEXT: ; implicit-def: $sgpr15
+; VI-NEXT: ; implicit-def: $sgpr13
+; VI-NEXT: ; implicit-def: $sgpr42
 ; VI-NEXT: ; implicit-def: $sgpr56
+; VI-NEXT: ; implicit-def: $sgpr11
+; VI-NEXT: ; implicit-def: $sgpr9
+; VI-NEXT: ; implicit-def: $sgpr7
 ; VI-NEXT: s_branch .LBB109_2
-; VI-NEXT: .LBB109_4:
-; VI-NEXT: v_mov_b32_e32 v19, s44
-; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v19, s42
-; VI-NEXT: v_mov_b32_e32 v1, s16
-; VI-NEXT: v_mov_b32_e32 v2, s17
-; VI-NEXT: v_mov_b32_e32 v3, s18
-; VI-NEXT: v_mov_b32_e32 v4, s19
-; VI-NEXT: v_mov_b32_e32 v5, s20
-; VI-NEXT: v_mov_b32_e32 v6, s21
-; VI-NEXT: v_mov_b32_e32 v7, s22
-; VI-NEXT: v_mov_b32_e32 v8, s23
-; VI-NEXT: v_mov_b32_e32 v9, s24
-; VI-NEXT: v_mov_b32_e32 v10, s25
-; VI-NEXT: v_mov_b32_e32 v11, s26
-; VI-NEXT: v_mov_b32_e32 v12, s27
-; VI-NEXT: v_mov_b32_e32 v13, s28
-; VI-NEXT: v_mov_b32_e32 v14, s29
-; VI-NEXT: v_mov_b32_e32 v15, s4
-; VI-NEXT: v_mov_b32_e32 v16, s5
-; VI-NEXT: v_mov_b32_e32 v18, s67
-; VI-NEXT: v_mov_b32_e32 v62, s65
-; VI-NEXT: v_mov_b32_e32 v17, s66
-; VI-NEXT: v_mov_b32_e32 v60, s64
-; VI-NEXT: v_mov_b32_e32 v61, s55
-; VI-NEXT: v_mov_b32_e32 v58, s54
-; VI-NEXT: v_mov_b32_e32 v59, s52
-; VI-NEXT: v_mov_b32_e32 v57, s53
-; VI-NEXT: v_mov_b32_e32 v47, s51
-; VI-NEXT: v_mov_b32_e32 v56, s50
-; VI-NEXT: v_mov_b32_e32 v46, s49
-; VI-NEXT: v_mov_b32_e32 v45, s39
-; VI-NEXT: v_mov_b32_e32 v44, s48
-; VI-NEXT: v_mov_b32_e32 v42, s38
-; VI-NEXT: v_mov_b32_e32 v43, s37
-; VI-NEXT: v_mov_b32_e32 v41, s36
-; VI-NEXT: v_mov_b32_e32 v40, s34
-; VI-NEXT: v_mov_b32_e32 v55, s35
-; VI-NEXT: v_mov_b32_e32 v53, s31
-; VI-NEXT: v_mov_b32_e32 v54, s30
-; VI-NEXT: v_mov_b32_e32 v52, s91
-; VI-NEXT: v_mov_b32_e32 v51, s89
-; VI-NEXT: v_mov_b32_e32 v50, s90
-; VI-NEXT: v_mov_b32_e32 v48, s88
-; VI-NEXT: v_mov_b32_e32 v49, s79
-; VI-NEXT: v_mov_b32_e32 v39, s78
-; VI-NEXT: v_mov_b32_e32 v38, s76
-; VI-NEXT: v_mov_b32_e32 v37, s77
-; VI-NEXT: v_mov_b32_e32 v35, s75
-; VI-NEXT: v_mov_b32_e32 v36, s74
-; VI-NEXT: v_mov_b32_e32 v34, s73
-; VI-NEXT: v_mov_b32_e32 v33, s63
-; VI-NEXT: v_mov_b32_e32 v32, s72
-; VI-NEXT: v_mov_b32_e32 v30, s62
-; VI-NEXT: v_mov_b32_e32 v31, s61
-; VI-NEXT: v_mov_b32_e32 v29, s60
-; VI-NEXT: v_mov_b32_e32 v28, s58
-; VI-NEXT: v_mov_b32_e32 v27, s59
-; VI-NEXT: v_mov_b32_e32 v25, s57
-; VI-NEXT: v_mov_b32_e32 v26, s56
-; VI-NEXT: v_mov_b32_e32 v21, s12
-; VI-NEXT: v_mov_b32_e32 v22, s10
-; VI-NEXT: v_mov_b32_e32 v23, s8
-; VI-NEXT: v_mov_b32_e32 v24, s6
-; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v19, s40
-; VI-NEXT: v_mov_b32_e32 v20, s14
-; VI-NEXT: .LBB109_5: ; %end
-; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17
-; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18
-; VI-NEXT: v_or_b32_sdwa v2, v2, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v24
-; VI-NEXT: v_or_b32_sdwa v1, v1, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v17, v62, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v1, v1, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v61
-; VI-NEXT: v_or_b32_sdwa v1, v60, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_add_u32_e32 v2, vcc, 4, v0
-; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v23
-; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v58
-; VI-NEXT: v_or_b32_sdwa v1, v59, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_add_u32_e32 v2, vcc, 8, v0
-; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v57
-; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v56
-; VI-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v2, v47, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_add_u32_e32 v2, vcc, 12, v0
-; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v46
-; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v22
-; VI-NEXT: v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v2, v45, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_add_u32_e32 v2, vcc, 16, v0
-; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v44
-; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v43
-; VI-NEXT: v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v2, v42, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_add_u32_e32 v2, vcc, 20, v0
-; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v41
-; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v21
-; VI-NEXT: v_or_b32_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v2, v40, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_add_u32_e32 v2, vcc, 24, v0
-; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v55
-; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v54
-; VI-NEXT: v_or_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v2, v53, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_add_u32_e32 v2, vcc, 28, v0
-; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v52
-; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v20
-; VI-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v2, v51, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_add_u32_e32 v2, vcc, 32, v0
-; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v50
-; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v49
-; VI-NEXT: v_or_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v2, v48, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_add_u32_e32 v2, vcc, 36, v0
-; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v39
-; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v19
-; VI-NEXT: v_or_b32_sdwa v1, v11, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_add_u32_e32 v2, vcc, 40, v0
-; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v37
-; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v36
-; VI-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v2, v35, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_add_u32_e32 v2, vcc, 44, v0
-; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
-; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v34
-; VI-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_readlane_b32 s67, v63, 19
-; VI-NEXT: v_readlane_b32 s66, v63, 18
-; VI-NEXT: v_readlane_b32 s65, v63, 17
-; VI-NEXT: v_readlane_b32 s64, v63, 16
-; VI-NEXT: v_readlane_b32 s55, v63, 15
-; VI-NEXT: v_readlane_b32 s54, v63, 14
-; VI-NEXT: v_readlane_b32 s53, v63, 13
-; VI-NEXT: v_readlane_b32 s52, v63, 12
-; VI-NEXT: v_readlane_b32 s51, v63, 11
-; VI-NEXT: v_readlane_b32 s50, v63, 10
-; VI-NEXT: v_readlane_b32 s49, v63, 9
-; VI-NEXT: v_readlane_b32 s48, v63, 8
-; VI-NEXT: v_readlane_b32 s39, v63, 7
-; VI-NEXT: v_readlane_b32 s38, v63, 6
-; VI-NEXT: v_readlane_b32 s37, v63, 5
-; VI-NEXT: v_readlane_b32 s36, v63, 4
-; VI-NEXT: v_readlane_b32 s35, v63, 3
-; VI-NEXT: v_readlane_b32 s34, v63, 2
-; VI-NEXT: v_readlane_b32 s31, v63, 1
-; VI-NEXT: v_readlane_b32 s30, v63, 0
-; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
-; VI-NEXT: v_or_b32_sdwa v2, v33, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_add_u32_e32 v2, vcc, 48, v0
-; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v32
-; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v31
-; VI-NEXT: v_or_b32_sdwa v1, v14, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v2, v30, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_add_u32_e32 v2, vcc, 52, v0
-; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v29
-; VI-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
-; VI-NEXT: v_or_b32_sdwa v2, v28, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_add_u32_e32 v2, vcc, 56, v0
-; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v27
-; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v26
-; VI-NEXT: v_or_b32_sdwa v1, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v2, v25, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_add_u32_e32 v0, vcc, 60, v0
-; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
-; VI-NEXT: s_or_saveexec_b64 s[4:5], -1
-; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
-; VI-NEXT: s_mov_b64 exec, s[4:5]
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: bitcast_v32bf16_to_v64i8_scalar:
 ; GFX9: ; %bb.0:
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.64bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.64bit.ll
index 14e17ce49cca0..1dcc010349123 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.64bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.64bit.ll
@@ -2272,30 +2272,32 @@ define inreg i64 @bitcast_v4bf16_to_i64_scalar(<4 x bfloat> inreg %a, i32 inreg
 ; SI: ; %bb.0:
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT: s_cmp_lg_u32 s20, 0
-; SI-NEXT: v_mul_f32_e64 v4, 1.0, s17
-; SI-NEXT: v_mul_f32_e64 v5, 1.0, s16
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s19
-; SI-NEXT: v_mul_f32_e64 v3, 1.0, s18
+; SI-NEXT: v_mul_f32_e64 v7, 1.0, s17
+; SI-NEXT: v_mul_f32_e64 v4, 1.0, s16
+; SI-NEXT: v_mul_f32_e64 v6, 1.0, s19
+; SI-NEXT: v_mul_f32_e64 v2, 1.0, s18
 ; SI-NEXT: s_cbranch_scc0 .LBB23_4
 ; SI-NEXT: ; %bb.1: ; %cmp.false
-; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v4
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v2
-; SI-NEXT: v_alignbit_b32 v0, v0, v5, 16
-; SI-NEXT: v_alignbit_b32 v1, v1, v3, 16
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v6
+; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v7
+; SI-NEXT: v_lshr_b64 v[8:9], v[2:3], 16
+; SI-NEXT: v_lshr_b64 v[0:1], v[4:5], 16
+; SI-NEXT: v_mov_b32_e32 v1, v8
 ; SI-NEXT: s_cbranch_execnz .LBB23_3
 ; SI-NEXT: .LBB23_2: ; %cmp.true
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
-; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v5
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v6
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v7
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v4
 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v3
-; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16
+; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 16
+; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16
+; SI-NEXT: v_mov_b32_e32 v1, v2
 ; SI-NEXT: .LBB23_3: ; %end
 ; SI-NEXT: s_setpc_b64 s[30:31]
 ; SI-NEXT: .LBB23_4:
@@ -2311,42 +2313,43 @@ define inreg i64 @bitcast_v4bf16_to_i64_scalar(<4 x bfloat> inreg %a, i32 inreg
 ; VI-NEXT: s_cbranch_execnz .LBB23_4
 ; VI-NEXT: .LBB23_2: ; %cmp.true
 ; VI-NEXT: s_lshl_b32 s4, s17, 16
-; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
+; VI-NEXT: v_mov_b32_e32 v4, 0x40c00000
+; VI-NEXT: v_add_f32_e32 v0, s4, v4
+; VI-NEXT: v_bfe_u32 v1, v0, 16, 1
+; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0
+; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
+; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; VI-NEXT: s_and_b32 s4, s17, 0xffff0000
+; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
+; VI-NEXT: v_add_f32_e32 v1, s4, v4
 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s17, 0xffff0000
 ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; VI-NEXT: v_add_f32_e32 v2, s4, v0
-; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
-; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
-; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
 ; VI-NEXT: s_lshl_b32 s4, s16, 16
-; VI-NEXT: v_alignbit_b32 v1, v2, v1, 16
-; VI-NEXT: v_add_f32_e32 v2, s4, v0
-; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
-; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
-; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
+; VI-NEXT: v_lshrrev_b64 v[2:3], 16, v[0:1]
+; VI-NEXT: v_add_f32_e32 v0, s4, v4
+; VI-NEXT: v_bfe_u32 v1, v0, 16, 1
+; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0
+; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
+; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
 ; VI-NEXT: s_and_b32 s4, s16, 0xffff0000
-; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: v_add_f32_e32 v0, s4, v0
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_bfe_u32 v3, v0, 16, 1
-; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v0
+; VI-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc
+; VI-NEXT: v_add_f32_e32 v1, s4, v4
+; VI-NEXT: v_bfe_u32 v3, v1, 16, 1
+; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1
 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT: v_or_b32_e32 v4, 0x400000, v0
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; VI-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; VI-NEXT: v_alignbit_b32 v0, v0, v2, 16
+; VI-NEXT: v_or_b32_e32 v4, 0x400000, v1
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; VI-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1]
+; VI-NEXT: v_mov_b32_e32 v1, v2
 ; VI-NEXT: s_setpc_b64 s[30:31]
 ; VI-NEXT: .LBB23_3:
 ; VI-NEXT: s_branch .LBB23_2
@@ -5460,30 +5463,32 @@ define inreg double @bitcast_v4bf16_to_f64_scalar(<4 x bfloat> inreg %a, i32 inr
 ; SI: ; %bb.0:
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT: s_cmp_lg_u32 s20, 0
-; SI-NEXT: v_mul_f32_e64 v4, 1.0, s17
-; SI-NEXT: v_mul_f32_e64 v5, 1.0, s16
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s19
-; SI-NEXT: v_mul_f32_e64 v3, 1.0, s18
+; SI-NEXT: v_mul_f32_e64 v7, 1.0, s17
+; SI-NEXT: v_mul_f32_e64 v4, 1.0, s16
+; SI-NEXT: v_mul_f32_e64 v6, 1.0, s19
+; SI-NEXT: v_mul_f32_e64 v2, 1.0, s18
 ; SI-NEXT: s_cbranch_scc0 .LBB47_4
 ; SI-NEXT: ; %bb.1: ; %cmp.false
-; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v4
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v2
-; SI-NEXT: v_alignbit_b32 v0, v0, v5, 16
-; SI-NEXT: v_alignbit_b32 v1, v1, v3, 16
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v6
+; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v7
+; SI-NEXT: v_lshr_b64 v[8:9], v[2:3], 16
+; SI-NEXT: v_lshr_b64 v[0:1], v[4:5], 16
+; SI-NEXT: v_mov_b32_e32 v1, v8
 ; SI-NEXT: s_cbranch_execnz .LBB47_3
 ; SI-NEXT: .LBB47_2: ; %cmp.true
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
-; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v5
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v6
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v7
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v4
 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v3
-; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16
+; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 16
+; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16
+; SI-NEXT: v_mov_b32_e32 v1, v2
 ; SI-NEXT: .LBB47_3: ; %end
 ; SI-NEXT: s_setpc_b64 s[30:31]
 ; SI-NEXT: .LBB47_4:
@@ -5499,42 +5504,43 @@ define inreg double @bitcast_v4bf16_to_f64_scalar(<4 x bfloat> inreg %a, i32 inr
 ; VI-NEXT: s_cbranch_execnz .LBB47_4
 ; VI-NEXT: .LBB47_2: ; %cmp.true
 ; VI-NEXT: s_lshl_b32 s4, s17, 16
-; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
+; VI-NEXT: v_mov_b32_e32 v4, 0x40c00000
+; VI-NEXT: v_add_f32_e32 v0, s4, v4
+; VI-NEXT: v_bfe_u32 v1, v0, 16, 1
+; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0
+; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
+; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; VI-NEXT: s_and_b32 s4, s17, 0xffff0000
+; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
+; VI-NEXT: v_add_f32_e32 v1, s4, v4
 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s17, 0xffff0000
 ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; VI-NEXT: v_add_f32_e32 v2, s4, v0
-; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
-; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
-; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
 ; VI-NEXT: s_lshl_b32 s4, s16, 16
-; VI-NEXT: v_alignbit_b32 v1, v2, v1, 16
-; VI-NEXT: v_add_f32_e32 v2, s4, v0
-; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
-; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
-; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
+; VI-NEXT: v_lshrrev_b64 v[2:3], 16, v[0:1]
+; VI-NEXT: v_add_f32_e32 v0, s4, v4
+; VI-NEXT: v_bfe_u32 v1, v0, 16, 1
+; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0
+; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
+; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
 ; VI-NEXT: s_and_b32 s4, s16, 0xffff0000
-; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: v_add_f32_e32 v0, s4, v0
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_bfe_u32 v3, v0, 16, 1
-; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v0
+; VI-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc
+; VI-NEXT: v_add_f32_e32 v1, s4, v4
+; VI-NEXT: v_bfe_u32 v3, v1, 16, 1
+; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1
 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT: v_or_b32_e32 v4, 0x400000, v0
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; VI-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; VI-NEXT: v_alignbit_b32 v0, v0, v2, 16
+; VI-NEXT: v_or_b32_e32 v4, 0x400000, v1
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; VI-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1]
+; VI-NEXT: v_mov_b32_e32 v1, v2
 ; VI-NEXT: s_setpc_b64 s[30:31]
 ; VI-NEXT: .LBB47_3:
 ; VI-NEXT: s_branch .LBB47_2
@@ -8361,30 +8367,32 @@ define inreg <2 x i32> @bitcast_v4bf16_to_v2i32_scalar(<4 x bfloat> inreg %a, i3
 ; SI: ; %bb.0:
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT: s_cmp_lg_u32 s20, 0
-; SI-NEXT: v_mul_f32_e64 v4, 1.0, s17
-; SI-NEXT: v_mul_f32_e64 v5, 1.0, s16
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s19
-; SI-NEXT: v_mul_f32_e64 v3, 1.0, s18
+; SI-NEXT: v_mul_f32_e64 v7, 1.0, s17
+; SI-NEXT: v_mul_f32_e64 v4, 1.0, s16
+; SI-NEXT: v_mul_f32_e64 v6, 1.0, s19
+; SI-NEXT: v_mul_f32_e64 v2, 1.0, s18
 ; SI-NEXT: s_cbranch_scc0 .LBB67_4
 ; SI-NEXT: ; %bb.1: ; %cmp.false
-; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v4
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v2
-; SI-NEXT: v_alignbit_b32 v0, v0, v5, 16
-; SI-NEXT: v_alignbit_b32 v1, v1, v3, 16
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v6
+; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v7
+; SI-NEXT: v_lshr_b64 v[8:9], v[2:3], 16
+; SI-NEXT: v_lshr_b64 v[0:1], v[4:5], 16
+; SI-NEXT: v_mov_b32_e32 v1, v8
 ; SI-NEXT: s_cbranch_execnz .LBB67_3
 ; SI-NEXT: .LBB67_2: ; %cmp.true
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
-; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v5
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v6
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v7
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v4
 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v3
-; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16
+; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 16
+; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16
+; SI-NEXT: v_mov_b32_e32 v1, v2
 ; SI-NEXT: .LBB67_3: ; %end
 ; SI-NEXT: s_setpc_b64 s[30:31]
 ; SI-NEXT: .LBB67_4:
@@ -8400,42 +8408,43 @@ define inreg <2 x i32> @bitcast_v4bf16_to_v2i32_scalar(<4 x bfloat> inreg %a, i3
 ; VI-NEXT: s_cbranch_execnz .LBB67_4
 ; VI-NEXT: .LBB67_2: ; %cmp.true
 ; VI-NEXT: s_lshl_b32 s4, s17, 16
-; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
+; VI-NEXT: v_mov_b32_e32 v4, 0x40c00000
+; VI-NEXT: v_add_f32_e32 v0, s4, v4
+; VI-NEXT: v_bfe_u32 v1, v0, 16, 1
+; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0
+; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
+; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; VI-NEXT: s_and_b32 s4, s17, 0xffff0000
+; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
+; VI-NEXT: v_add_f32_e32 v1, s4, v4
 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s17, 0xffff0000
 ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; VI-NEXT: v_add_f32_e32 v2, s4, v0
-; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
-; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
-; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
 ; VI-NEXT: s_lshl_b32 s4, s16, 16
-; VI-NEXT: v_alignbit_b32 v1, v2, v1, 16
-; VI-NEXT: v_add_f32_e32 v2, s4, v0
-; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
-; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
-; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
+; VI-NEXT: v_lshrrev_b64 v[2:3], 16, v[0:1]
+; VI-NEXT: v_add_f32_e32 v0, s4, v4
+; VI-NEXT: v_bfe_u32 v1, v0, 16, 1
+; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0
+; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
+; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
 ; VI-NEXT: s_and_b32 s4, s16, 0xffff0000
-; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: v_add_f32_e32 v0, s4, v0
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_bfe_u32 v3, v0, 16, 1
-; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v0
+; VI-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc
+; VI-NEXT: v_add_f32_e32 v1, s4, v4
+; VI-NEXT: v_bfe_u32 v3, v1, 16, 1
+; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1
 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT: v_or_b32_e32 v4, 0x400000, v0
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; VI-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; VI-NEXT: v_alignbit_b32 v0, v0, v2, 16
+; VI-NEXT: v_or_b32_e32 v4, 0x400000, v1
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; VI-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1]
+; VI-NEXT: v_mov_b32_e32 v1, v2
 ; VI-NEXT: s_setpc_b64 s[30:31]
 ; VI-NEXT: .LBB67_3:
 ; VI-NEXT: s_branch .LBB67_2
@@ -10937,30 +10946,32 @@ define inreg <2 x float> @bitcast_v4bf16_to_v2f32_scalar(<4 x bfloat> inreg %a,
 ; SI: ; %bb.0:
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT: s_cmp_lg_u32 s20, 0
-; SI-NEXT: v_mul_f32_e64 v4, 1.0, s17
-; SI-NEXT: v_mul_f32_e64 v5, 1.0, s16
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s19
-; SI-NEXT: v_mul_f32_e64 v3, 1.0, s18
+; SI-NEXT: v_mul_f32_e64 v7, 1.0, s17
+; SI-NEXT: v_mul_f32_e64 v4, 1.0, s16
+; SI-NEXT: v_mul_f32_e64 v6, 1.0, s19
+; SI-NEXT: v_mul_f32_e64 v2, 1.0, s18
 ; SI-NEXT: s_cbranch_scc0 .LBB83_4
 ; SI-NEXT: ; %bb.1: ; %cmp.false
-; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v4
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v2
-; SI-NEXT: v_alignbit_b32 v0, v0, v5, 16
-; SI-NEXT: v_alignbit_b32 v1, v1, v3, 16
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v6
+; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v7
+; SI-NEXT: v_lshr_b64 v[8:9], v[2:3], 16
+; SI-NEXT: v_lshr_b64 v[0:1], v[4:5], 16
+; SI-NEXT: v_mov_b32_e32 v1, v8
 ; SI-NEXT: s_cbranch_execnz .LBB83_3
 ; SI-NEXT: .LBB83_2: ; %cmp.true
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
-; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v5
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v6
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v7
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v4
 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v3
-; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16
+; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 16
+; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16
+; SI-NEXT: v_mov_b32_e32 v1, v2
 ; SI-NEXT: .LBB83_3: ; %end
 ; SI-NEXT: s_setpc_b64 s[30:31]
 ; SI-NEXT: .LBB83_4:
@@ -10976,42 +10987,43 @@ define inreg <2 x float> @bitcast_v4bf16_to_v2f32_scalar(<4 x bfloat> inreg %a,
 ; VI-NEXT: s_cbranch_execnz .LBB83_4
 ; VI-NEXT: .LBB83_2: ; %cmp.true
 ; VI-NEXT: s_lshl_b32 s4, s17, 16
-; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
+; VI-NEXT: v_mov_b32_e32 v4, 0x40c00000
+; VI-NEXT: v_add_f32_e32 v0, s4, v4
+; VI-NEXT: v_bfe_u32 v1, v0, 16, 1
+; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0
+; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
+; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; VI-NEXT: s_and_b32 s4, s17, 0xffff0000
+; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
+; VI-NEXT: v_add_f32_e32 v1, s4, v4
 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s17, 0xffff0000
 ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; VI-NEXT: v_add_f32_e32 v2, s4, v0
-; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
-; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
-; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
 ; VI-NEXT: s_lshl_b32 s4, s16, 16
-; VI-NEXT: v_alignbit_b32 v1, v2, v1, 16
-; VI-NEXT: v_add_f32_e32 v2, s4, v0
-; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
-; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
-; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
+; VI-NEXT: v_lshrrev_b64 v[2:3], 16, v[0:1]
+; VI-NEXT: v_add_f32_e32 v0, s4, v4
+; VI-NEXT: v_bfe_u32 v1, v0, 16, 1
+; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0
+; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
+; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
 ; VI-NEXT: s_and_b32 s4, s16, 0xffff0000
-; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: v_add_f32_e32 v0, s4, v0
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_bfe_u32 v3, v0, 16, 1
-; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v0
+; VI-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc
+; VI-NEXT: v_add_f32_e32 v1, s4, v4
+; VI-NEXT: v_bfe_u32 v3, v1, 16, 1
+; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1
 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT: v_or_b32_e32 v4, 0x400000, v0
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; VI-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; VI-NEXT: v_alignbit_b32 v0, v0, v2, 16
+; VI-NEXT: v_or_b32_e32 v4, 0x400000, v1
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; VI-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1]
+; VI-NEXT: v_mov_b32_e32 v1, v2
 ; VI-NEXT: s_setpc_b64 s[30:31]
 ; VI-NEXT: .LBB83_3:
 ; VI-NEXT: s_branch .LBB83_2
@@ -13151,37 +13163,38 @@ define inreg <4 x i16> @bitcast_v4bf16_to_v4i16_scalar(<4 x bfloat> inreg %a, i3
 ; SI-NEXT: s_cmp_lg_u32 s20, 0
 ; SI-NEXT: v_mul_f32_e64 v7, 1.0, s16
 ; SI-NEXT: v_mul_f32_e64 v6, 1.0, s17
-; SI-NEXT: v_mul_f32_e64 v5, 1.0, s18
-; SI-NEXT: v_mul_f32_e64 v1, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v5, 1.0, s19 ; SI-NEXT: s_cbranch_scc0 .LBB95_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v5 ; SI-NEXT: s_cbranch_execnz .LBB95_3 ; SI-NEXT: .LBB95_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v7 -; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v1 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v4 -; SI-NEXT: v_lshr_b64 v[4:5], v[1:2], 16 -; SI-NEXT: v_alignbit_b32 v0, v6, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v6 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v5 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; SI-NEXT: v_lshr_b64 v[4:5], v[2:3], 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 +; SI-NEXT: v_mov_b32_e32 v2, v4 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: .LBB95_3: ; %end -; SI-NEXT: v_mov_b32_e32 v1, v4 +; SI-NEXT: v_mov_b32_e32 v2, v4 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB95_4: ; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: s_branch .LBB95_2 ; @@ -13194,42 +13207,43 @@ define inreg <4 x i16> @bitcast_v4bf16_to_v4i16_scalar(<4 x bfloat> inreg %a, i3 ; VI-NEXT: s_cbranch_execnz .LBB95_4 ; VI-NEXT: .LBB95_2: ; %cmp.true ; VI-NEXT: s_lshl_b32 s4, s16, 16 -; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_mov_b32_e32 v3, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v0, s4, v3 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; VI-NEXT: v_add_f32_e32 v1, s4, v3 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc -; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc ; VI-NEXT: s_lshl_b32 s4, s17, 16 -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v4, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v1 +; VI-NEXT: v_cndmask_b32_e32 
v1, v2, v4, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v3 +; VI-NEXT: v_bfe_u32 v4, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v2 ; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 ; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: v_add_f32_e32 v0, s4, v0 -; VI-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc -; VI-NEXT: v_bfe_u32 v4, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v0 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_add_f32_e32 v3, s4, v3 +; VI-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc +; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 ; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v0 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v1, v0, v1, 16 -; VI-NEXT: v_alignbit_b32 v0, v3, v2, 16 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_lshrrev_b64 v[2:3], 16, v[2:3] +; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, v2 ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB95_3: ; VI-NEXT: s_branch .LBB95_2 @@ -15062,42 +15076,43 @@ define inreg <4 x half> @bitcast_v4bf16_to_v4f16_scalar(<4 x bfloat> inreg %a, i ; VI-NEXT: s_cbranch_execnz .LBB103_4 ; VI-NEXT: .LBB103_2: ; %cmp.true ; VI-NEXT: s_lshl_b32 s4, s16, 16 -; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_mov_b32_e32 v3, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v0, s4, v3 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; VI-NEXT: v_add_f32_e32 v1, s4, v3 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc -; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc ; VI-NEXT: s_lshl_b32 s4, s17, 16 -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v4, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v3 +; VI-NEXT: v_bfe_u32 v4, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v2 ; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 ; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: v_add_f32_e32 v0, s4, v0 -; VI-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc -; VI-NEXT: v_bfe_u32 v4, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v0 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_add_f32_e32 v3, s4, v3 +; VI-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc +; VI-NEXT: 
v_bfe_u32 v4, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 ; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v0 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v1, v0, v1, 16 -; VI-NEXT: v_alignbit_b32 v0, v3, v2, 16 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_lshrrev_b64 v[2:3], 16, v[2:3] +; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, v2 ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB103_3: ; VI-NEXT: s_branch .LBB103_2 @@ -16737,52 +16752,54 @@ define inreg <8 x i8> @bitcast_v4bf16_to_v8i8_scalar(<4 x bfloat> inreg %a, i32 ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s20, 0 -; SI-NEXT: v_mul_f32_e64 v11, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v15, 1.0, s17 ; SI-NEXT: v_mul_f32_e64 v12, 1.0, s16 -; SI-NEXT: v_mul_f32_e64 v0, 1.0, s19 -; SI-NEXT: v_mul_f32_e64 v8, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v14, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v5, 1.0, s18 ; SI-NEXT: s_cbranch_scc0 .LBB109_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v0 -; SI-NEXT: v_alignbit_b32 v9, v1, v12, 16 -; SI-NEXT: v_alignbit_b32 v10, v6, v8, 16 -; SI-NEXT: v_lshr_b64 v[3:4], v[9:10], 24 -; SI-NEXT: v_lshr_b64 v[4:5], v[9:10], 16 -; SI-NEXT: v_lshr_b64 v[1:2], v[9:10], 8 -; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v0 -; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v10 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v15 +; SI-NEXT: v_lshr_b64 v[10:11], v[5:6], 16 +; SI-NEXT: v_lshr_b64 v[0:1], v[12:13], 16 +; SI-NEXT: v_mov_b32_e32 v1, v10 +; SI-NEXT: v_lshr_b64 v[8:9], v[0:1], 16 +; SI-NEXT: v_lshr_b64 v[3:4], v[0:1], 24 +; SI-NEXT: v_lshr_b64 v[1:2], v[0:1], 8 +; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v14 +; SI-NEXT: v_lshrrev_b32_e32 v9, 8, v10 ; SI-NEXT: s_cbranch_execnz .LBB109_3 ; SI-NEXT: .LBB109_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v11 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v12 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v14 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v15 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v2 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v12 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; SI-NEXT: v_alignbit_b32 v9, v2, v1, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v8 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v7 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v0 -; SI-NEXT: v_alignbit_b32 v10, v6, v1, 16 -; SI-NEXT: v_lshr_b64 v[3:4], v[9:10], 24 -; SI-NEXT: v_lshr_b64 v[4:5], v[9:10], 16 -; SI-NEXT: v_lshr_b64 v[1:2], v[9:10], 8 -; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v10 -; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshr_b64 v[10:11], v[5:6], 16 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 +; SI-NEXT: v_mov_b32_e32 v1, v10 +; SI-NEXT: v_lshr_b64 v[3:4], v[0:1], 24 +; SI-NEXT: v_lshr_b64 v[8:9], v[0:1], 16 +; SI-NEXT: v_lshr_b64 v[1:2], v[0:1], 8 +; SI-NEXT: 
v_lshrrev_b32_e32 v9, 8, v10 +; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v7 ; SI-NEXT: .LBB109_3: ; %end -; SI-NEXT: v_mov_b32_e32 v0, v9 -; SI-NEXT: v_mov_b32_e32 v2, v4 +; SI-NEXT: v_mov_b32_e32 v2, v8 ; SI-NEXT: v_mov_b32_e32 v4, v10 +; SI-NEXT: v_mov_b32_e32 v5, v9 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB109_4: -; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: s_branch .LBB109_2 ; @@ -16793,11 +16810,11 @@ define inreg <8 x i8> @bitcast_v4bf16_to_v8i8_scalar(<4 x bfloat> inreg %a, i32 ; VI-NEXT: s_cbranch_scc0 .LBB109_3 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_lshr_b64 s[4:5], s[16:17], 24 -; VI-NEXT: s_lshr_b32 s8, s17, 24 -; VI-NEXT: s_lshr_b32 s5, s17, 16 -; VI-NEXT: s_lshr_b32 s10, s17, 8 -; VI-NEXT: s_lshr_b32 s9, s16, 16 -; VI-NEXT: s_lshr_b32 s11, s16, 8 +; VI-NEXT: s_lshr_b32 s5, s17, 24 +; VI-NEXT: s_lshr_b32 s11, s17, 16 +; VI-NEXT: s_lshr_b32 s8, s17, 8 +; VI-NEXT: s_lshr_b32 s10, s16, 16 +; VI-NEXT: s_lshr_b32 s9, s16, 8 ; VI-NEXT: s_cbranch_execnz .LBB109_4 ; VI-NEXT: .LBB109_2: ; %cmp.true ; VI-NEXT: s_lshl_b32 s4, s17, 16 @@ -16810,58 +16827,59 @@ define inreg <8 x i8> @bitcast_v4bf16_to_v8i8_scalar(<4 x bfloat> inreg %a, i32 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_cndmask_b32_e32 v3, v2, v3, vcc +; VI-NEXT: v_cndmask_b32_e32 v5, v2, v3, vcc ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc -; VI-NEXT: s_lshl_b32 s4, s16, 16 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc ; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v1 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v4, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v1 +; VI-NEXT: v_lshrrev_b64 v[1:2], 16, v[5:6] +; VI-NEXT: s_lshl_b32 s4, s16, 16 +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; VI-NEXT: v_add_f32_e32 v0, s4, v0 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 -; VI-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc -; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v0 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_bfe_u32 v3, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v1, v5, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v2, v6, v3, 16 -; VI-NEXT: v_alignbit_b32 v1, v0, v4, 16 -; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v3 -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v4 -; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[1:2] -; VI-NEXT: 
v_lshrrev_b32_e32 v7, 24, v2 -; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v2 -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v1 +; VI-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; VI-NEXT: v_lshrrev_b64 v[9:10], 16, v[2:3] +; VI-NEXT: v_mov_b32_e32 v10, v1 +; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v5 +; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[9:10] +; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v7, 24, v1 +; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v9 ; VI-NEXT: v_mov_b32_e32 v4, v8 ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB109_3: -; VI-NEXT: ; implicit-def: $sgpr11 ; VI-NEXT: ; implicit-def: $sgpr9 -; VI-NEXT: ; implicit-def: $sgpr4 ; VI-NEXT: ; implicit-def: $sgpr10 -; VI-NEXT: ; implicit-def: $sgpr5 +; VI-NEXT: ; implicit-def: $sgpr4 ; VI-NEXT: ; implicit-def: $sgpr8 +; VI-NEXT: ; implicit-def: $sgpr11 +; VI-NEXT: ; implicit-def: $sgpr5 ; VI-NEXT: s_branch .LBB109_2 ; VI-NEXT: .LBB109_4: -; VI-NEXT: v_mov_b32_e32 v1, s11 -; VI-NEXT: v_mov_b32_e32 v2, s9 -; VI-NEXT: v_mov_b32_e32 v5, s10 -; VI-NEXT: v_mov_b32_e32 v7, s8 -; VI-NEXT: v_mov_b32_e32 v3, s4 ; VI-NEXT: v_mov_b32_e32 v0, s16 -; VI-NEXT: v_mov_b32_e32 v6, s5 +; VI-NEXT: v_mov_b32_e32 v6, s11 +; VI-NEXT: v_mov_b32_e32 v2, s10 +; VI-NEXT: v_mov_b32_e32 v1, s9 +; VI-NEXT: v_mov_b32_e32 v7, s5 +; VI-NEXT: v_mov_b32_e32 v5, s8 +; VI-NEXT: v_mov_b32_e32 v3, s4 ; VI-NEXT: v_mov_b32_e32 v4, s17 ; VI-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll index 6ada0cb8c46f1..4abb70e0ec5c9 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll @@ -2214,40 +2214,42 @@ define inreg <3 x i32> @bitcast_v6bf16_to_v3i32_scalar(<6 x bfloat> inreg %a, i3 ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s22, 0 -; SI-NEXT: v_mul_f32_e64 v7, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v8, 1.0, s16 -; SI-NEXT: v_mul_f32_e64 v5, 1.0, s19 -; SI-NEXT: v_mul_f32_e64 v6, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v3, 1.0, s21 -; SI-NEXT: v_mul_f32_e64 v4, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v11, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v7, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v10, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v5, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v9, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v3, 1.0, s20 ; SI-NEXT: s_cbranch_scc0 .LBB11_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v3 -; SI-NEXT: v_alignbit_b32 v0, v0, v8, 16 -; SI-NEXT: v_alignbit_b32 v1, v1, v6, 16 -; SI-NEXT: v_alignbit_b32 v2, v2, v4, 16 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v9 +; SI-NEXT: v_lshr_b64 v[0:1], v[7:8], 16 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v10 +; SI-NEXT: v_lshr_b64 v[12:13], v[3:4], 16 +; SI-NEXT: v_lshr_b64 v[1:2], v[5:6], 16 +; SI-NEXT: v_mov_b32_e32 v2, v12 ; SI-NEXT: s_cbranch_execnz .LBB11_3 ; SI-NEXT: .LBB11_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v7 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v8 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v11 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v7 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 -; SI-NEXT: v_alignbit_b32 
v0, v1, v0, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v9 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v10 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v5 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v4 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; SI-NEXT: v_lshr_b64 v[3:4], v[3:4], 16 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: v_mov_b32_e32 v2, v3 ; SI-NEXT: .LBB11_3: ; %end ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB11_4: @@ -2263,60 +2265,61 @@ define inreg <3 x i32> @bitcast_v6bf16_to_v3i32_scalar(<6 x bfloat> inreg %a, i3 ; VI-NEXT: s_cbranch_execnz .LBB11_4 ; VI-NEXT: .LBB11_2: ; %cmp.true ; VI-NEXT: s_lshl_b32 s4, s18, 16 -; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_mov_b32_e32 v5, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v0, s4, v5 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; VI-NEXT: v_add_f32_e32 v1, s4, v5 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; VI-NEXT: s_lshl_b32 s4, s17, 16 -; VI-NEXT: v_alignbit_b32 v2, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_lshrrev_b64 v[2:3], 16, v[0:1] +; VI-NEXT: v_add_f32_e32 v0, s4, v5 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc +; VI-NEXT: v_add_f32_e32 v1, s4, v5 ; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 ; VI-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc -; VI-NEXT: v_add_f32_e32 v3, s4, v0 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; 
VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; VI-NEXT: s_lshl_b32 s4, s16, 16 -; VI-NEXT: v_alignbit_b32 v1, v3, v1, 16 -; VI-NEXT: v_add_f32_e32 v3, s4, v0 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: v_lshrrev_b64 v[3:4], 16, v[0:1] +; VI-NEXT: v_add_f32_e32 v0, s4, v5 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_add_f32_e32 v0, s4, v0 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: v_bfe_u32 v4, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v4, vcc +; VI-NEXT: v_add_f32_e32 v1, s4, v5 +; VI-NEXT: v_bfe_u32 v4, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v1 ; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v0 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, v3 ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB11_3: ; VI-NEXT: s_branch .LBB11_2 @@ -5430,40 +5433,42 @@ define inreg <3 x float> @bitcast_v6bf16_to_v3f32_scalar(<6 x bfloat> inreg %a, ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s22, 0 -; SI-NEXT: v_mul_f32_e64 v7, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v8, 1.0, s16 -; SI-NEXT: v_mul_f32_e64 v5, 1.0, s19 -; SI-NEXT: v_mul_f32_e64 v6, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v3, 1.0, s21 -; SI-NEXT: v_mul_f32_e64 v4, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v11, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v7, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v10, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v5, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v9, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v3, 1.0, s20 ; SI-NEXT: s_cbranch_scc0 .LBB27_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v3 -; SI-NEXT: v_alignbit_b32 v0, v0, v8, 16 -; SI-NEXT: v_alignbit_b32 v1, v1, v6, 16 -; SI-NEXT: v_alignbit_b32 v2, v2, v4, 16 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v9 +; SI-NEXT: v_lshr_b64 v[0:1], v[7:8], 16 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v10 +; SI-NEXT: v_lshr_b64 v[12:13], v[3:4], 16 +; SI-NEXT: v_lshr_b64 v[1:2], v[5:6], 16 +; SI-NEXT: v_mov_b32_e32 v2, v12 ; SI-NEXT: s_cbranch_execnz .LBB27_3 ; SI-NEXT: .LBB27_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v7 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v8 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v11 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v7 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 -; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v9 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v10 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, 
v3 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v5 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v4 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; SI-NEXT: v_lshr_b64 v[3:4], v[3:4], 16 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: v_mov_b32_e32 v2, v3 ; SI-NEXT: .LBB27_3: ; %end ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB27_4: @@ -5479,60 +5484,61 @@ define inreg <3 x float> @bitcast_v6bf16_to_v3f32_scalar(<6 x bfloat> inreg %a, ; VI-NEXT: s_cbranch_execnz .LBB27_4 ; VI-NEXT: .LBB27_2: ; %cmp.true ; VI-NEXT: s_lshl_b32 s4, s18, 16 -; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_mov_b32_e32 v5, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v0, s4, v5 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; VI-NEXT: v_add_f32_e32 v1, s4, v5 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; VI-NEXT: s_lshl_b32 s4, s17, 16 -; VI-NEXT: v_alignbit_b32 v2, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_lshrrev_b64 v[2:3], 16, v[0:1] +; VI-NEXT: v_add_f32_e32 v0, s4, v5 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc +; VI-NEXT: v_add_f32_e32 v1, s4, v5 ; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 ; VI-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc -; VI-NEXT: v_add_f32_e32 v3, s4, v0 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; VI-NEXT: s_lshl_b32 s4, s16, 16 -; VI-NEXT: v_alignbit_b32 v1, v3, v1, 16 -; VI-NEXT: v_add_f32_e32 v3, s4, v0 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: 
v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: v_lshrrev_b64 v[3:4], 16, v[0:1] +; VI-NEXT: v_add_f32_e32 v0, s4, v5 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_add_f32_e32 v0, s4, v0 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: v_bfe_u32 v4, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v4, vcc +; VI-NEXT: v_add_f32_e32 v1, s4, v5 +; VI-NEXT: v_bfe_u32 v4, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v1 ; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v0 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, v3 ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB27_3: ; VI-NEXT: s_branch .LBB27_2 @@ -8098,70 +8104,73 @@ define inreg <12 x i8> @bitcast_v6bf16_to_v12i8_scalar(<6 x bfloat> inreg %a, i3 ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s22, 0 -; SI-NEXT: v_mul_f32_e64 v17, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v18, 1.0, s16 -; SI-NEXT: v_mul_f32_e64 v15, 1.0, s19 -; SI-NEXT: v_mul_f32_e64 v16, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v0, 1.0, s21 -; SI-NEXT: v_mul_f32_e64 v14, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v21, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v17, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v20, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v5, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v19, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v9, 1.0, s20 ; SI-NEXT: s_cbranch_scc0 .LBB39_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v15 -; SI-NEXT: v_alignbit_b32 v12, v1, v18, 16 -; SI-NEXT: v_alignbit_b32 v13, v6, v16, 16 -; SI-NEXT: v_lshr_b64 v[3:4], v[12:13], 24 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v0 -; SI-NEXT: v_lshr_b64 v[4:5], v[12:13], 16 -; SI-NEXT: v_alignbit_b32 v8, v10, v14, 16 -; SI-NEXT: v_lshr_b64 v[1:2], v[12:13], 8 -; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v15 -; SI-NEXT: v_lshrrev_b32_e32 v11, 24, v0 -; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v13 -; SI-NEXT: v_lshrrev_b32_e32 v9, 8, v8 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v21 +; SI-NEXT: v_lshr_b64 v[14:15], v[5:6], 16 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v19 +; SI-NEXT: v_lshr_b64 v[0:1], v[17:18], 16 +; SI-NEXT: v_lshr_b64 v[15:16], v[9:10], 16 +; SI-NEXT: v_mov_b32_e32 v1, v14 +; SI-NEXT: v_lshr_b64 v[3:4], v[0:1], 24 +; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v20 +; SI-NEXT: v_lshrrev_b32_e32 v11, 24, v19 +; SI-NEXT: v_lshrrev_b32_e32 v8, 8, v14 +; SI-NEXT: v_lshrrev_b32_e32 v16, 8, v15 +; SI-NEXT: v_lshr_b64 v[12:13], v[0:1], 16 +; SI-NEXT: v_lshr_b64 v[1:2], v[0:1], 8 ; SI-NEXT: s_cbranch_execnz .LBB39_3 ; SI-NEXT: .LBB39_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v17 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v18 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: 
v_alignbit_b32 v12, v2, v1, 16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v15 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v20 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v21 ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v9 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v17 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v7 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; SI-NEXT: v_alignbit_b32 v13, v6, v1, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v14 +; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v19 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v0 -; SI-NEXT: v_lshr_b64 v[3:4], v[12:13], 24 -; SI-NEXT: v_alignbit_b32 v8, v10, v1, 16 -; SI-NEXT: v_lshr_b64 v[4:5], v[12:13], 16 -; SI-NEXT: v_lshr_b64 v[1:2], v[12:13], 8 -; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v13 -; SI-NEXT: v_lshrrev_b32_e32 v9, 8, v8 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshr_b64 v[14:15], v[5:6], 16 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v2 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v5 +; SI-NEXT: v_mov_b32_e32 v1, v14 +; SI-NEXT: v_lshr_b64 v[15:16], v[9:10], 16 +; SI-NEXT: v_lshr_b64 v[3:4], v[0:1], 24 +; SI-NEXT: v_lshr_b64 v[12:13], v[0:1], 16 +; SI-NEXT: v_lshr_b64 v[1:2], v[0:1], 8 +; SI-NEXT: v_lshrrev_b32_e32 v8, 8, v14 +; SI-NEXT: v_lshrrev_b32_e32 v16, 8, v15 ; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v7 -; SI-NEXT: v_lshrrev_b32_e32 v11, 24, v0 +; SI-NEXT: v_lshrrev_b32_e32 v11, 24, v5 ; SI-NEXT: .LBB39_3: ; %end -; SI-NEXT: v_mov_b32_e32 v0, v12 -; SI-NEXT: v_mov_b32_e32 v2, v4 -; SI-NEXT: v_mov_b32_e32 v4, v13 +; SI-NEXT: v_mov_b32_e32 v2, v12 +; SI-NEXT: v_mov_b32_e32 v4, v14 +; SI-NEXT: v_mov_b32_e32 v5, v8 +; SI-NEXT: v_mov_b32_e32 v8, v15 +; SI-NEXT: v_mov_b32_e32 v9, v16 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB39_4: -; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: s_branch .LBB39_2 ; @@ -8171,110 +8180,110 @@ define inreg <12 x i8> @bitcast_v6bf16_to_v12i8_scalar(<6 x bfloat> inreg %a, i3 ; VI-NEXT: s_cmp_lg_u32 s19, 0 ; VI-NEXT: s_cbranch_scc0 .LBB39_3 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: s_lshr_b32 s19, s16, 8 -; VI-NEXT: s_lshr_b32 s10, s18, 16 -; VI-NEXT: s_lshr_b32 s11, s18, 8 +; VI-NEXT: s_lshr_b32 s19, s18, 16 +; VI-NEXT: s_lshr_b32 s15, s18, 8 ; VI-NEXT: s_lshr_b32 s12, s17, 24 -; VI-NEXT: s_lshr_b32 s13, s17, 16 -; VI-NEXT: s_lshr_b32 s15, s17, 8 +; VI-NEXT: s_lshr_b32 s11, s17, 16 +; VI-NEXT: s_lshr_b32 s10, s17, 8 ; VI-NEXT: s_lshr_b32 s14, s16, 16 +; VI-NEXT: s_lshr_b32 s13, s16, 8 ; VI-NEXT: s_lshr_b64 s[6:7], s[18:19], 24 ; VI-NEXT: s_lshr_b64 s[4:5], s[16:17], 24 ; VI-NEXT: s_cbranch_execnz .LBB39_4 ; VI-NEXT: .LBB39_2: ; %cmp.true ; VI-NEXT: s_lshl_b32 s4, 
s17, 16 -; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_mov_b32_e32 v3, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v0, s4, v3 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; VI-NEXT: v_add_f32_e32 v1, s4, v3 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; VI-NEXT: s_lshl_b32 s4, s16, 16 -; VI-NEXT: v_alignbit_b32 v15, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_lshrrev_b64 v[4:5], 16, v[0:1] +; VI-NEXT: v_add_f32_e32 v0, s4, v3 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; VI-NEXT: v_add_f32_e32 v1, s4, v3 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; VI-NEXT: s_lshl_b32 s4, s18, 16 -; VI-NEXT: v_alignbit_b32 v14, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v5, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v3 +; VI-NEXT: v_bfe_u32 v5, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v2 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 ; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: v_add_f32_e32 v0, s4, v0 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v8, v0, v1, 16 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_add_f32_e32 v3, s4, v3 +; VI-NEXT: 
v_cndmask_b32_e32 v2, v5, v6, vcc +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] +; VI-NEXT: v_lshrrev_b64 v[8:9], 16, v[2:3] +; VI-NEXT: v_mov_b32_e32 v1, v4 ; VI-NEXT: v_mov_b32_e32 v9, 0x7fc07fc0 -; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[14:15] +; VI-NEXT: v_lshrrev_b64 v[14:15], 24, v[0:1] ; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[8:9] ; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v8 ; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v8 -; VI-NEXT: v_lshrrev_b32_e32 v7, 24, v15 -; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v15 -; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v15 -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v14 -; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v14 +; VI-NEXT: v_lshrrev_b32_e32 v7, 24, v4 +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v0 ; VI-NEXT: s_branch .LBB39_5 ; VI-NEXT: .LBB39_3: -; VI-NEXT: ; implicit-def: $sgpr19 +; VI-NEXT: ; implicit-def: $sgpr13 ; VI-NEXT: ; implicit-def: $sgpr14 ; VI-NEXT: ; implicit-def: $sgpr4 -; VI-NEXT: ; implicit-def: $sgpr15 -; VI-NEXT: ; implicit-def: $sgpr13 -; VI-NEXT: ; implicit-def: $sgpr12 -; VI-NEXT: ; implicit-def: $sgpr11 ; VI-NEXT: ; implicit-def: $sgpr10 +; VI-NEXT: ; implicit-def: $sgpr11 +; VI-NEXT: ; implicit-def: $sgpr12 +; VI-NEXT: ; implicit-def: $sgpr15 +; VI-NEXT: ; implicit-def: $sgpr19 ; VI-NEXT: ; implicit-def: $sgpr6 ; VI-NEXT: s_branch .LBB39_2 ; VI-NEXT: .LBB39_4: -; VI-NEXT: v_mov_b32_e32 v14, s16 -; VI-NEXT: v_mov_b32_e32 v15, s17 ; VI-NEXT: v_mov_b32_e32 v8, s18 -; VI-NEXT: v_mov_b32_e32 v1, s19 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v4, s17 +; VI-NEXT: v_mov_b32_e32 v10, s19 +; VI-NEXT: v_mov_b32_e32 v13, s15 ; VI-NEXT: v_mov_b32_e32 v2, s14 -; VI-NEXT: v_mov_b32_e32 v5, s15 -; VI-NEXT: v_mov_b32_e32 v6, s13 +; VI-NEXT: v_mov_b32_e32 v1, s13 ; VI-NEXT: v_mov_b32_e32 v7, s12 -; VI-NEXT: v_mov_b32_e32 v13, s11 -; VI-NEXT: v_mov_b32_e32 v10, s10 +; VI-NEXT: v_mov_b32_e32 v6, s11 +; VI-NEXT: v_mov_b32_e32 v5, s10 ; VI-NEXT: v_mov_b32_e32 v11, s6 -; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: v_mov_b32_e32 v14, s4 ; VI-NEXT: .LBB39_5: ; %end -; VI-NEXT: v_mov_b32_e32 v0, v14 -; VI-NEXT: v_mov_b32_e32 v4, v15 +; VI-NEXT: v_mov_b32_e32 v3, v14 ; VI-NEXT: v_mov_b32_e32 v9, v13 ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -11854,33 +11863,32 @@ define inreg <6 x half> @bitcast_v6bf16_to_v6f16_scalar(<6 x bfloat> inreg %a, i ; VI-NEXT: s_cbranch_execnz .LBB49_4 ; VI-NEXT: .LBB49_2: ; %cmp.true ; VI-NEXT: s_lshl_b32 s4, s16, 16 -; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_mov_b32_e32 v3, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v0, s4, v3 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_cndmask_b32_e32 v3, v2, v3, vcc +; VI-NEXT: 
v_cndmask_b32_e32 v0, v1, v2, vcc +; VI-NEXT: v_add_f32_e32 v1, s4, v3 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc ; VI-NEXT: s_lshl_b32 s4, s17, 16 -; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v1 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v3 +; VI-NEXT: v_bfe_u32 v4, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v2 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 ; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v5, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_add_f32_e32 v2, s4, v3 +; VI-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc ; VI-NEXT: v_bfe_u32 v5, v2, 16, 1 ; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v2 ; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 @@ -11889,25 +11897,27 @@ define inreg <6 x half> @bitcast_v6bf16_to_v6f16_scalar(<6 x bfloat> inreg %a, i ; VI-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc ; VI-NEXT: s_lshl_b32 s4, s18, 16 ; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_add_f32_e32 v2, s4, v3 ; VI-NEXT: v_bfe_u32 v6, v2, 16, 1 ; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v2 ; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 ; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 ; VI-NEXT: v_or_b32_e32 v7, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_add_f32_e32 v0, s4, v0 +; VI-NEXT: v_add_f32_e32 v3, s4, v3 ; VI-NEXT: v_cndmask_b32_e32 v2, v6, v7, vcc -; VI-NEXT: v_bfe_u32 v6, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v0 +; VI-NEXT: v_bfe_u32 v6, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v3 ; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; VI-NEXT: v_or_b32_e32 v7, 0x400000, v0 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v6, v7, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v2, v0, v2, 16 -; VI-NEXT: v_alignbit_b32 v1, v5, v1, 16 -; VI-NEXT: v_alignbit_b32 v0, v4, v3, 16 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v6, v7, vcc +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_lshrrev_b64 v[2:3], 16, v[2:3] +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_lshrrev_b64 v[3:4], 16, v[4:5] +; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, v3 ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB49_3: ; VI-NEXT: s_branch .LBB49_2 @@ -12814,49 +12824,51 @@ define inreg <6 x i16> @bitcast_v6bf16_to_v6i16_scalar(<6 x bfloat> inreg %a, i3 ; SI-NEXT: s_cmp_lg_u32 s22, 0 ; SI-NEXT: v_mul_f32_e64 v11, 1.0, s16 ; SI-NEXT: v_mul_f32_e64 v10, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v7, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s19 -; SI-NEXT: v_mul_f32_e64 v9, 1.0, s20 -; SI-NEXT: v_mul_f32_e64 v8, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v8, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v4, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v9, 1.0, s21 ; SI-NEXT: s_cbranch_scc0 .LBB53_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v10 -; 
SI-NEXT: v_lshrrev_b32_e32 v2, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v9 ; SI-NEXT: s_cbranch_execnz .LBB53_3 ; SI-NEXT: .LBB53_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v10 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v10 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v11 -; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v1 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v4 -; SI-NEXT: v_alignbit_b32 v0, v2, v0, 16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v9 -; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v8 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v7 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v10 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v9 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v8 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshr_b64 v[6:7], v[4:5], 16 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 -; SI-NEXT: v_lshr_b64 v[6:7], v[1:2], 16 -; SI-NEXT: v_alignbit_b32 v4, v5, v9, 16 +; SI-NEXT: v_lshr_b64 v[7:8], v[2:3], 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v10 +; SI-NEXT: v_mov_b32_e32 v2, v7 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: .LBB53_3: ; %end -; SI-NEXT: v_mov_b32_e32 v1, v6 +; SI-NEXT: v_mov_b32_e32 v2, v7 +; SI-NEXT: v_mov_b32_e32 v4, v6 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB53_4: ; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: s_branch .LBB53_2 ; @@ -12869,33 +12881,32 @@ define inreg <6 x i16> @bitcast_v6bf16_to_v6i16_scalar(<6 x bfloat> inreg %a, i3 ; VI-NEXT: s_cbranch_execnz .LBB53_4 ; VI-NEXT: .LBB53_2: ; %cmp.true ; VI-NEXT: s_lshl_b32 s4, s16, 16 -; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_mov_b32_e32 v3, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v0, s4, v3 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_cndmask_b32_e32 v3, v2, v3, vcc +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; VI-NEXT: v_add_f32_e32 v1, s4, 
v3 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc ; VI-NEXT: s_lshl_b32 s4, s17, 16 -; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v1 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v3 +; VI-NEXT: v_bfe_u32 v4, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v2 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 ; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v5, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_add_f32_e32 v2, s4, v3 +; VI-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc ; VI-NEXT: v_bfe_u32 v5, v2, 16, 1 ; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v2 ; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 @@ -12904,25 +12915,27 @@ define inreg <6 x i16> @bitcast_v6bf16_to_v6i16_scalar(<6 x bfloat> inreg %a, i3 ; VI-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc ; VI-NEXT: s_lshl_b32 s4, s18, 16 ; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_add_f32_e32 v2, s4, v3 ; VI-NEXT: v_bfe_u32 v6, v2, 16, 1 ; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v2 ; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 ; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 ; VI-NEXT: v_or_b32_e32 v7, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_add_f32_e32 v0, s4, v0 +; VI-NEXT: v_add_f32_e32 v3, s4, v3 ; VI-NEXT: v_cndmask_b32_e32 v2, v6, v7, vcc -; VI-NEXT: v_bfe_u32 v6, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v0 +; VI-NEXT: v_bfe_u32 v6, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v3 ; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; VI-NEXT: v_or_b32_e32 v7, 0x400000, v0 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v6, v7, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v2, v0, v2, 16 -; VI-NEXT: v_alignbit_b32 v1, v5, v1, 16 -; VI-NEXT: v_alignbit_b32 v0, v4, v3, 16 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v6, v7, vcc +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_lshrrev_b64 v[2:3], 16, v[2:3] +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_lshrrev_b64 v[3:4], 16, v[4:5] +; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, v3 ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB53_3: ; VI-NEXT: s_branch .LBB53_2 diff --git a/llvm/test/CodeGen/AMDGPU/any_extend_vector_inreg.ll b/llvm/test/CodeGen/AMDGPU/any_extend_vector_inreg.ll index 18cf120a1d299..61645200690f5 100644 --- a/llvm/test/CodeGen/AMDGPU/any_extend_vector_inreg.ll +++ b/llvm/test/CodeGen/AMDGPU/any_extend_vector_inreg.ll @@ -27,24 +27,26 @@ define amdgpu_kernel void @any_extend_vector_inreg_v16i8_to_v4i32(ptr addrspace( ; GFX6-NEXT: buffer_store_byte v0, off, s[0:3], 0 offset:1 ; GFX6-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; GFX6-NEXT: buffer_store_byte v0, off, s[0:3], 0 offset:3 -; GFX6-NEXT: s_lshr_b32 s8, s9, 16 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_lshr_b32 s7, s9, 16 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_lshl_b64 
s[6:7], s[4:5], 8 -; GFX6-NEXT: v_mov_b32_e32 v1, s11 -; GFX6-NEXT: buffer_store_byte v1, off, s[0:3], 0 offset:9 +; GFX6-NEXT: s_lshl_b64 s[8:9], s[4:5], 8 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s11 +; GFX6-NEXT: buffer_store_byte v0, off, s[0:3], 0 offset:9 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: buffer_store_byte v0, off, s[0:3], 0 offset:2 +; GFX6-NEXT: s_lshr_b64 s[4:5], s[6:7], 16 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v1, s5 -; GFX6-NEXT: buffer_store_byte v1, off, s[0:3], 0 offset:2 -; GFX6-NEXT: v_alignbit_b32 v0, s8, v0, 16 +; GFX6-NEXT: v_mov_b32_e32 v0, s9 +; GFX6-NEXT: buffer_store_byte v0, off, s[0:3], 0 offset:12 +; GFX6-NEXT: s_lshr_b32 s5, s4, 8 +; GFX6-NEXT: s_lshr_b32 s4, s4, 24 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: buffer_store_byte v1, off, s[0:3], 0 offset:12 +; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: buffer_store_byte v0, off, s[0:3], 0 offset:5 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 8, v0 -; GFX6-NEXT: v_lshrrev_b32_e32 v0, 24, v0 -; GFX6-NEXT: buffer_store_byte v1, off, s[0:3], 0 offset:5 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: buffer_store_byte v0, off, s[0:3], 0 offset:7 ; GFX6-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll index f3885d6dadf9b..393e9fecbb308 100644 --- a/llvm/test/CodeGen/AMDGPU/bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/bf16.ll @@ -46065,18 +46065,18 @@ define <32 x bfloat> @v_select_v32bf16(i1 %cond, <32 x bfloat> %a, <32 x bfloat> define amdgpu_ps <2 x i32> @s_select_v3bf16(<3 x bfloat> inreg %a, <3 x bfloat> inreg %b, i32 %c) { ; GCN-LABEL: s_select_v3bf16: ; GCN: ; %bb.0: -; GCN-NEXT: v_mul_f32_e64 v1, 1.0, s1 -; GCN-NEXT: v_mul_f32_e64 v2, 1.0, s0 -; GCN-NEXT: v_mul_f32_e64 v3, 1.0, s4 -; GCN-NEXT: v_mul_f32_e64 v4, 1.0, s3 +; GCN-NEXT: v_mul_f32_e64 v2, 1.0, s1 +; GCN-NEXT: v_mul_f32_e64 v1, 1.0, s0 +; GCN-NEXT: v_mul_f32_e64 v4, 1.0, s4 +; GCN-NEXT: v_mul_f32_e64 v3, 1.0, s3 ; GCN-NEXT: v_mul_f32_e64 v5, 1.0, s2 ; GCN-NEXT: v_mul_f32_e64 v6, 1.0, s5 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; GCN-NEXT: v_alignbit_b32 v2, v3, v4, 16 +; GCN-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; GCN-NEXT: v_lshr_b64 v[2:3], v[3:4], 16 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GCN-NEXT: v_cndmask_b32_e32 v0, v6, v5, vcc ; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc @@ -46087,13 +46087,13 @@ define amdgpu_ps <2 x i32> @s_select_v3bf16(<3 x bfloat> inreg %a, <3 x bfloat> ; GFX7-LABEL: s_select_v3bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: v_mul_f32_e64 v1, 1.0, s1 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_mul_f32_e64 v2, 1.0, s0 -; GFX7-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX7-NEXT: v_mul_f32_e64 v1, 1.0, s0 +; GFX7-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; GFX7-NEXT: v_mul_f32_e64 v2, 1.0, s4 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_mul_f32_e64 v3, 1.0, s3 -; GFX7-NEXT: v_alignbit_b32 v2, v2, v3, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; GFX7-NEXT: v_mul_f32_e64 v2, 1.0, s3 +; GFX7-NEXT: v_lshr_b64 v[2:3], v[2:3], 16 ; GFX7-NEXT: v_mul_f32_e64 v3, 1.0, s2 ; GFX7-NEXT: v_mul_f32_e64 v4, 1.0, s5 ; GFX7-NEXT: 
v_lshrrev_b32_e32 v3, 16, v3 @@ -46203,22 +46203,22 @@ define amdgpu_ps <2 x i32> @s_select_v3bf16(<3 x bfloat> inreg %a, <3 x bfloat> define amdgpu_ps <2 x i32> @s_select_v4bf16(<4 x bfloat> inreg %a, <4 x bfloat> inreg %b, i32 %c) { ; GCN-LABEL: s_select_v4bf16: ; GCN: ; %bb.0: -; GCN-NEXT: v_mul_f32_e64 v1, 1.0, s1 -; GCN-NEXT: v_mul_f32_e64 v2, 1.0, s0 -; GCN-NEXT: v_mul_f32_e64 v3, 1.0, s5 -; GCN-NEXT: v_mul_f32_e64 v4, 1.0, s4 -; GCN-NEXT: v_mul_f32_e64 v5, 1.0, s3 -; GCN-NEXT: v_mul_f32_e64 v6, 1.0, s2 -; GCN-NEXT: v_mul_f32_e64 v7, 1.0, s7 -; GCN-NEXT: v_mul_f32_e64 v8, 1.0, s6 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; GCN-NEXT: v_alignbit_b32 v2, v3, v4, 16 -; GCN-NEXT: v_alignbit_b32 v3, v5, v6, 16 -; GCN-NEXT: v_alignbit_b32 v4, v7, v8, 16 +; GCN-NEXT: v_mul_f32_e64 v2, 1.0, s1 +; GCN-NEXT: v_mul_f32_e64 v1, 1.0, s0 +; GCN-NEXT: v_mul_f32_e64 v4, 1.0, s5 +; GCN-NEXT: v_mul_f32_e64 v3, 1.0, s4 +; GCN-NEXT: v_mul_f32_e64 v6, 1.0, s3 +; GCN-NEXT: v_mul_f32_e64 v5, 1.0, s2 +; GCN-NEXT: v_mul_f32_e64 v8, 1.0, s7 +; GCN-NEXT: v_mul_f32_e64 v7, 1.0, s6 +; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GCN-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; GCN-NEXT: v_lshr_b64 v[2:3], v[3:4], 16 +; GCN-NEXT: v_lshr_b64 v[3:4], v[5:6], 16 +; GCN-NEXT: v_lshr_b64 v[4:5], v[7:8], 16 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GCN-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc ; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc @@ -46229,21 +46229,21 @@ define amdgpu_ps <2 x i32> @s_select_v4bf16(<4 x bfloat> inreg %a, <4 x bfloat> ; GFX7-LABEL: s_select_v4bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: v_mul_f32_e64 v1, 1.0, s1 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_mul_f32_e64 v2, 1.0, s0 -; GFX7-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX7-NEXT: v_mul_f32_e64 v1, 1.0, s0 +; GFX7-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; GFX7-NEXT: v_mul_f32_e64 v2, 1.0, s5 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_mul_f32_e64 v3, 1.0, s4 -; GFX7-NEXT: v_alignbit_b32 v2, v2, v3, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; GFX7-NEXT: v_mul_f32_e64 v2, 1.0, s4 +; GFX7-NEXT: v_lshr_b64 v[2:3], v[2:3], 16 ; GFX7-NEXT: v_mul_f32_e64 v3, 1.0, s3 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_mul_f32_e64 v4, 1.0, s2 -; GFX7-NEXT: v_alignbit_b32 v3, v3, v4, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; GFX7-NEXT: v_mul_f32_e64 v3, 1.0, s2 +; GFX7-NEXT: v_lshr_b64 v[3:4], v[3:4], 16 ; GFX7-NEXT: v_mul_f32_e64 v4, 1.0, s7 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_mul_f32_e64 v5, 1.0, s6 -; GFX7-NEXT: v_alignbit_b32 v4, v4, v5, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; GFX7-NEXT: v_mul_f32_e64 v4, 1.0, s6 +; GFX7-NEXT: v_lshr_b64 v[4:5], v[4:5], 16 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc ; GFX7-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc diff --git a/llvm/test/CodeGen/AMDGPU/build_vector.ll b/llvm/test/CodeGen/AMDGPU/build_vector.ll index fbaaef0b29b66..37f4094806637 100644 --- a/llvm/test/CodeGen/AMDGPU/build_vector.ll +++ b/llvm/test/CodeGen/AMDGPU/build_vector.ll @@ -186,10 +186,12 @@ define amdgpu_kernel void @build_vector_v2i16_trunc (ptr addrspace(1) %out, i32 ; GFX6: ; %bb.0: ; GFX6-NEXT: 
s_load_dword s6, s[4:5], 0xb ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GFX6-NEXT: s_mov_b32 s7, 5 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_alignbit_b32 v0, 5, s6, 16 +; GFX6-NEXT: s_lshr_b64 s[4:5], s[6:7], 16 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll b/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll index 08545b901581c..8532a7f716ba7 100644 --- a/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll +++ b/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll @@ -487,13 +487,13 @@ define amdgpu_kernel void @uniform_vec_i16_HH(ptr addrspace(1) %out, i32 %a, i32 ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_lshr_b32 s3, s3, 16 +; GCN-NEXT: s_lshr_b64 s[2:3], s[2:3], 16 +; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_mov_b32 s4, s0 ; GCN-NEXT: s_mov_b32 s5, s1 -; GCN-NEXT: s_lshr_b32 s0, s3, 16 ; GCN-NEXT: v_mov_b32_e32 v0, s2 -; GCN-NEXT: v_alignbit_b32 v0, s0, v0, 16 ; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GCN-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll index 30bcdf97e26fd..4ff8bf23638f1 100644 --- a/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll @@ -5023,20 +5023,20 @@ define amdgpu_ps i32 @s_copysign_out_v2bf16_mag_v2f32_sign_v2bf16(<2 x float> in ; GFX8-NEXT: s_add_i32 s6, s4, 0x7fff ; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], s0, s0 ; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec -; GFX8-NEXT: s_cselect_b32 s3, s3, s6 -; GFX8-NEXT: s_bfe_u32 s0, s1, 0x10010 -; GFX8-NEXT: s_add_i32 s0, s0, s1 -; GFX8-NEXT: s_or_b32 s4, s1, 0x400000 -; GFX8-NEXT: s_add_i32 s5, s0, 0x7fff -; GFX8-NEXT: v_cmp_u_f32_e64 s[0:1], s1, s1 -; GFX8-NEXT: s_and_b64 s[0:1], s[0:1], exec -; GFX8-NEXT: s_cselect_b32 s0, s4, s5 -; GFX8-NEXT: s_lshr_b32 s0, s0, 16 -; GFX8-NEXT: v_mov_b32_e32 v0, s3 -; GFX8-NEXT: v_alignbit_b32 v0, s0, v0, 16 -; GFX8-NEXT: s_mov_b32 s0, 0x7fff7fff +; GFX8-NEXT: s_cselect_b32 s0, s3, s6 +; GFX8-NEXT: s_bfe_u32 s4, s1, 0x10010 +; GFX8-NEXT: s_add_i32 s4, s4, s1 +; GFX8-NEXT: s_or_b32 s3, s1, 0x400000 +; GFX8-NEXT: s_add_i32 s6, s4, 0x7fff +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], s1, s1 +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s1, s3, s6 +; GFX8-NEXT: s_lshr_b32 s1, s1, 16 +; GFX8-NEXT: s_lshr_b64 s[0:1], s[0:1], 16 +; GFX8-NEXT: s_mov_b32 s1, 0x7fff7fff +; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s2 -; GFX8-NEXT: v_bfi_b32 v0, s0, v0, v1 +; GFX8-NEXT: v_bfi_b32 v0, s1, v0, v1 ; GFX8-NEXT: v_readfirstlane_b32 s0, v0 ; GFX8-NEXT: ; return to shader part epilog ; @@ -5185,29 +5185,29 @@ define amdgpu_ps i32 @s_copysign_out_v2bf16_mag_v2f64_sign_v2bf16(<2 x double> i ; GFX8-NEXT: s_addk_i32 s8, 0x7fff ; GFX8-NEXT: s_bitset1_b32 s5, 22 ; GFX8-NEXT: s_and_b64 s[6:7], s[6:7], exec -; GFX8-NEXT: s_cselect_b32 s5, s5, s8 -; GFX8-NEXT: v_readfirstlane_b32 s8, v3 -; GFX8-NEXT: s_bitcmp1_b32 s8, 0 -; GFX8-NEXT: s_cselect_b64 s[6:7], -1, 0 -; GFX8-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7] -; GFX8-NEXT: s_and_b64 s[6:7], s[12:13], exec +; GFX8-NEXT: s_cselect_b32 s6, s5, s8 +; GFX8-NEXT: v_readfirstlane_b32 s5, v3 +; GFX8-NEXT: s_bitcmp1_b32 s5, 0 +; GFX8-NEXT: s_cselect_b64 s[8:9], 
-1, 0 +; GFX8-NEXT: s_or_b64 s[0:1], s[0:1], s[8:9] +; GFX8-NEXT: s_and_b64 s[8:9], s[12:13], exec ; GFX8-NEXT: v_cmp_u_f64_e64 s[2:3], s[2:3], s[2:3] -; GFX8-NEXT: s_cselect_b32 s6, 1, -1 -; GFX8-NEXT: s_add_i32 s6, s8, s6 +; GFX8-NEXT: s_cselect_b32 s7, 1, -1 +; GFX8-NEXT: s_add_i32 s7, s5, s7 ; GFX8-NEXT: s_and_b64 s[0:1], s[0:1], exec -; GFX8-NEXT: s_cselect_b32 s0, s8, s6 +; GFX8-NEXT: s_cselect_b32 s0, s5, s7 ; GFX8-NEXT: s_bfe_u32 s1, s0, 0x10010 ; GFX8-NEXT: s_add_i32 s1, s1, s0 -; GFX8-NEXT: s_add_i32 s6, s1, 0x7fff +; GFX8-NEXT: s_add_i32 s5, s1, 0x7fff ; GFX8-NEXT: s_or_b32 s7, s0, 0x400000 ; GFX8-NEXT: s_and_b64 s[0:1], s[2:3], exec -; GFX8-NEXT: s_cselect_b32 s0, s7, s6 -; GFX8-NEXT: s_lshr_b32 s0, s0, 16 -; GFX8-NEXT: v_mov_b32_e32 v0, s5 -; GFX8-NEXT: v_alignbit_b32 v0, s0, v0, 16 -; GFX8-NEXT: s_mov_b32 s0, 0x7fff7fff +; GFX8-NEXT: s_cselect_b32 s0, s7, s5 +; GFX8-NEXT: s_lshr_b32 s7, s0, 16 +; GFX8-NEXT: s_lshr_b64 s[0:1], s[6:7], 16 +; GFX8-NEXT: s_mov_b32 s1, 0x7fff7fff +; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s4 -; GFX8-NEXT: v_bfi_b32 v0, s0, v0, v1 +; GFX8-NEXT: v_bfi_b32 v0, s1, v0, v1 ; GFX8-NEXT: v_readfirstlane_b32 s0, v0 ; GFX8-NEXT: ; return to shader part epilog ; @@ -5421,19 +5421,19 @@ define amdgpu_ps i32 @s_copysign_out_v2bf16_mag_v2bf16_sign_v2f32(<2 x bfloat> i ; GFX8-NEXT: s_addk_i32 s3, 0x7fff ; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], s1, s1 ; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec -; GFX8-NEXT: s_cselect_b32 s1, s1, s3 -; GFX8-NEXT: s_bfe_u32 s3, s2, 0x10010 -; GFX8-NEXT: s_add_i32 s3, s3, s2 -; GFX8-NEXT: s_addk_i32 s3, 0x7fff -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], s2, s2 -; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec -; GFX8-NEXT: s_cselect_b32 s2, s2, s3 -; GFX8-NEXT: s_lshr_b32 s2, s2, 16 -; GFX8-NEXT: v_mov_b32_e32 v0, s1 -; GFX8-NEXT: v_alignbit_b32 v0, s2, v0, 16 +; GFX8-NEXT: s_cselect_b32 s4, s1, s3 +; GFX8-NEXT: s_bfe_u32 s1, s2, 0x10010 +; GFX8-NEXT: s_add_i32 s1, s1, s2 +; GFX8-NEXT: s_addk_i32 s1, 0x7fff +; GFX8-NEXT: v_cmp_u_f32_e64 s[6:7], s2, s2 +; GFX8-NEXT: s_and_b64 s[6:7], s[6:7], exec +; GFX8-NEXT: s_cselect_b32 s1, s2, s1 +; GFX8-NEXT: s_lshr_b32 s5, s1, 16 +; GFX8-NEXT: s_lshr_b64 s[2:3], s[4:5], 16 ; GFX8-NEXT: s_mov_b32 s1, 0x7fff7fff -; GFX8-NEXT: v_mov_b32_e32 v1, s0 -; GFX8-NEXT: v_bfi_b32 v0, s1, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s2 +; GFX8-NEXT: v_bfi_b32 v0, s1, v0, v1 ; GFX8-NEXT: v_readfirstlane_b32 s0, v0 ; GFX8-NEXT: ; return to shader part epilog ; diff --git a/llvm/test/CodeGen/AMDGPU/fneg-fabs.bf16.ll b/llvm/test/CodeGen/AMDGPU/fneg-fabs.bf16.ll index 10c60dfc9b34c..5424ebfcffcd1 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg-fabs.bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-fabs.bf16.ll @@ -409,7 +409,7 @@ define amdgpu_kernel void @s_fneg_fabs_v2bf16_non_bc_src(ptr addrspace(1) %out, ; CI-NEXT: v_add_f32_e64 v1, s2, 2.0 ; CI-NEXT: v_add_f32_e64 v0, s3, 1.0 ; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; CI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; CI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 ; CI-NEXT: v_or_b32_e32 v2, 0x80008000, v0 ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 @@ -441,7 +441,7 @@ define amdgpu_kernel void @s_fneg_fabs_v2bf16_non_bc_src(ptr addrspace(1) %out, ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc ; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] ; VI-NEXT: v_or_b32_e32 v2, 0x80008000, v0 ; VI-NEXT: v_mov_b32_e32 v0, 
s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -709,16 +709,16 @@ define amdgpu_kernel void @fold_user_fneg_fabs_v2bf16(ptr addrspace(1) %out, <2 ; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_and_b32 s3, s2, 0x7fff0000 -; CI-NEXT: s_and_b32 s2, s2, 0x7fff -; CI-NEXT: s_lshl_b32 s2, s2, 16 -; CI-NEXT: v_mul_f32_e64 v0, s3, -4.0 -; CI-NEXT: v_mul_f32_e64 v1, s2, -4.0 -; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; CI-NEXT: v_alignbit_b32 v2, v0, v1, 16 -; CI-NEXT: v_mov_b32_e32 v0, s0 -; CI-NEXT: v_mov_b32_e32 v1, s1 -; CI-NEXT: flat_store_dword v[0:1], v2 +; CI-NEXT: s_and_b32 s3, s2, 0x7fff +; CI-NEXT: s_and_b32 s2, s2, 0x7fff0000 +; CI-NEXT: v_mul_f32_e64 v0, s2, -4.0 +; CI-NEXT: s_lshl_b32 s2, s3, 16 +; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; CI-NEXT: v_mul_f32_e64 v0, s2, -4.0 +; CI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 +; CI-NEXT: v_mov_b32_e32 v2, s1 +; CI-NEXT: v_mov_b32_e32 v1, s0 +; CI-NEXT: flat_store_dword v[1:2], v0 ; CI-NEXT: s_endpgm ; ; VI-LABEL: fold_user_fneg_fabs_v2bf16: @@ -749,10 +749,10 @@ define amdgpu_kernel void @fold_user_fneg_fabs_v2bf16(ptr addrspace(1) %out, <2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc ; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-NEXT: v_alignbit_b32 v2, v1, v0, 16 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] +; VI-NEXT: v_mov_b32_e32 v2, s1 +; VI-NEXT: v_mov_b32_e32 v1, s0 +; VI-NEXT: flat_store_dword v[1:2], v0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: fold_user_fneg_fabs_v2bf16: @@ -956,17 +956,17 @@ define amdgpu_kernel void @s_fneg_multi_use_fabs_foldable_neg_v2bf16(ptr addrspa ; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: v_mov_b32_e32 v2, s2 ; CI-NEXT: s_and_b32 s1, s4, 0x7fff +; CI-NEXT: s_and_b32 s2, s4, 0x7fff0000 +; CI-NEXT: v_mul_f32_e64 v4, s2, -4.0 ; CI-NEXT: s_lshl_b32 s1, s1, 16 +; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; CI-NEXT: v_mul_f32_e64 v4, s1, -4.0 -; CI-NEXT: s_and_b32 s1, s4, 0x7fff0000 -; CI-NEXT: v_mul_f32_e64 v5, s1, -4.0 ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: s_and_b32 s0, s4, 0x7fff7fff -; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; CI-NEXT: v_alignbit_b32 v4, v5, v4, 16 +; CI-NEXT: v_lshr_b64 v[4:5], v[4:5], 16 ; CI-NEXT: v_mov_b32_e32 v5, s0 -; CI-NEXT: v_mov_b32_e32 v2, s2 ; CI-NEXT: v_mov_b32_e32 v3, s3 ; CI-NEXT: flat_store_dword v[0:1], v5 ; CI-NEXT: flat_store_dword v[2:3], v4 @@ -1000,10 +1000,10 @@ define amdgpu_kernel void @s_fneg_multi_use_fabs_foldable_neg_v2bf16(ptr addrspa ; VI-NEXT: v_or_b32_e32 v7, 0x400000, v5 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 ; VI-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc +; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: s_and_b32 s0, s4, 0x7fff7fff -; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; VI-NEXT: v_alignbit_b32 v4, v5, v4, 16 +; VI-NEXT: v_lshrrev_b64 v[4:5], 16, v[4:5] ; VI-NEXT: v_mov_b32_e32 v5, s0 ; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v3, s3 diff --git a/llvm/test/CodeGen/AMDGPU/fneg.bf16.ll b/llvm/test/CodeGen/AMDGPU/fneg.bf16.ll index 84b904ff67151..63aadaacbeb3a 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg.bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg.bf16.ll @@ -627,18 +627,18 @@ define amdgpu_kernel void @v_fneg_fold_v2bf16(ptr addrspace(1) %out, ptr addrspa ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; 
CI-NEXT: v_mov_b32_e32 v1, s3 -; CI-NEXT: flat_load_dword v2, v[0:1] +; CI-NEXT: flat_load_dword v1, v[0:1] ; CI-NEXT: v_mov_b32_e32 v0, s0 -; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_xor_b32_e32 v3, 0x8000, v2 -; CI-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; CI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; CI-NEXT: v_mul_f32_e64 v2, -v2, v2 -; CI-NEXT: v_mul_f32_e32 v3, v3, v4 -; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; CI-NEXT: v_alignbit_b32 v2, v2, v3, 16 +; CI-NEXT: v_xor_b32_e32 v2, 0x8000, v1 +; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; CI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; CI-NEXT: v_mul_f32_e64 v4, -v1, v1 +; CI-NEXT: v_mul_f32_e32 v1, v2, v3 +; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v4 +; CI-NEXT: v_lshr_b64 v[2:3], v[1:2], 16 +; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: flat_store_dword v[0:1], v2 ; CI-NEXT: s_endpgm ; @@ -648,34 +648,34 @@ define amdgpu_kernel void @v_fneg_fold_v2bf16(ptr addrspace(1) %out, ptr addrspa ; GFX8-NEXT: s_add_i32 s12, s12, s17 ; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX8-NEXT: v_mov_b32_e32 v3, 0x8000 +; GFX8-NEXT: v_mov_b32_e32 v2, 0x8000 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: flat_load_dword v2, v[0:1] +; GFX8-NEXT: flat_load_dword v1, v[0:1] ; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: v_xor_b32_sdwa v5, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX8-NEXT: v_xor_b32_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_mul_f32_e32 v3, v5, v4 -; GFX8-NEXT: v_mul_f32_e32 v2, v2, v6 -; GFX8-NEXT: v_bfe_u32 v4, v3, 16, 1 -; GFX8-NEXT: v_bfe_u32 v6, v2, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v2 -; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX8-NEXT: v_xor_b32_sdwa v4, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX8-NEXT: v_xor_b32_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_mul_f32_e32 v2, v4, v3 +; GFX8-NEXT: v_mul_f32_e32 v3, v1, v5 +; GFX8-NEXT: v_bfe_u32 v1, v2, 16, 1 +; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v2, v6, v7, vcc +; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: v_alignbit_b32 v2, v2, v3, 16 +; GFX8-NEXT: v_lshrrev_b64 v[2:3], 16, v[1:2] +; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm 
; diff --git a/llvm/test/CodeGen/AMDGPU/fshl.ll b/llvm/test/CodeGen/AMDGPU/fshl.ll index 68b95cd9adbf3..72c2003058a01 100644 --- a/llvm/test/CodeGen/AMDGPU/fshl.ll +++ b/llvm/test/CodeGen/AMDGPU/fshl.ll @@ -18,12 +18,15 @@ define amdgpu_kernel void @fshl_i32(ptr addrspace(1) %in, i32 %x, i32 %y, i32 %z ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s1 -; SI-NEXT: s_lshr_b32 s1, s0, 1 -; SI-NEXT: v_alignbit_b32 v0, s0, v0, 1 -; SI-NEXT: s_not_b32 s0, s2 -; SI-NEXT: v_mov_b32_e32 v1, s0 -; SI-NEXT: v_alignbit_b32 v0, s1, v0, v1 +; SI-NEXT: s_mov_b32 s8, s1 +; SI-NEXT: s_mov_b32 s9, s0 +; SI-NEXT: s_lshr_b32 s3, s0, 1 +; SI-NEXT: s_lshr_b64 s[0:1], s[8:9], 1 +; SI-NEXT: s_not_b32 s2, s2 +; SI-NEXT: s_mov_b32 s1, s3 +; SI-NEXT: s_and_b32 s2, s2, 31 +; SI-NEXT: s_lshr_b64 s[0:1], s[0:1], s2 +; SI-NEXT: v_mov_b32_e32 v0, s0 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; @@ -32,14 +35,17 @@ define amdgpu_kernel void @fshl_i32(ptr addrspace(1) %in, i32 %x, i32 %y, i32 %z ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s1 +; VI-NEXT: s_mov_b32 s6, s1 +; VI-NEXT: s_mov_b32 s7, s0 +; VI-NEXT: s_lshr_b32 s3, s0, 1 ; VI-NEXT: s_not_b32 s2, s2 -; VI-NEXT: s_lshr_b32 s1, s0, 1 -; VI-NEXT: v_alignbit_b32 v0, s0, v0, 1 -; VI-NEXT: v_mov_b32_e32 v1, s2 -; VI-NEXT: v_alignbit_b32 v2, s1, v0, v1 +; VI-NEXT: s_lshr_b64 s[0:1], s[6:7], 1 +; VI-NEXT: s_mov_b32 s1, s3 +; VI-NEXT: s_and_b32 s2, s2, 31 +; VI-NEXT: s_lshr_b64 s[0:1], s[0:1], s2 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -49,12 +55,15 @@ define amdgpu_kernel void @fshl_i32(ptr addrspace(1) %in, i32 %x, i32 %y, i32 %z ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: s_mov_b32 s4, s1 +; GFX9-NEXT: s_mov_b32 s5, s0 +; GFX9-NEXT: s_lshr_b32 s3, s0, 1 +; GFX9-NEXT: s_lshr_b64 s[0:1], s[4:5], 1 ; GFX9-NEXT: s_not_b32 s2, s2 -; GFX9-NEXT: s_lshr_b32 s1, s0, 1 -; GFX9-NEXT: v_alignbit_b32 v1, s0, v1, 1 -; GFX9-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-NEXT: v_alignbit_b32 v1, s1, v1, v2 +; GFX9-NEXT: s_mov_b32 s1, s3 +; GFX9-NEXT: s_and_b32 s2, s2, 31 +; GFX9-NEXT: s_lshr_b64 s[0:1], s[0:1], s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: global_store_dword v0, v1, s[6:7] ; GFX9-NEXT: s_endpgm ; @@ -77,13 +86,18 @@ define amdgpu_kernel void @fshl_i32(ptr addrspace(1) %in, i32 %x, i32 %y, i32 %z ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c ; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_alignbit_b32 v0, s0, s1, 1 -; GFX10-NEXT: s_lshr_b32 s0, s0, 1 -; GFX10-NEXT: s_not_b32 s1, s2 -; GFX10-NEXT: v_alignbit_b32 v0, s0, v0, s1 -; GFX10-NEXT: global_store_dword v1, v0, s[6:7] +; GFX10-NEXT: s_mov_b32 s4, s1 +; GFX10-NEXT: s_mov_b32 s5, s0 +; GFX10-NEXT: s_lshr_b32 s3, s0, 1 +; GFX10-NEXT: s_not_b32 s2, s2 +; GFX10-NEXT: s_lshr_b64 s[0:1], s[4:5], 1 +; GFX10-NEXT: s_mov_b32 s1, s3 +; GFX10-NEXT: s_and_b32 s2, s2, 31 +; GFX10-NEXT: s_lshr_b64 s[0:1], s[0:1], s2 +; GFX10-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-NEXT: global_store_dword v0, v1, s[6:7] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: 
fshl_i32: @@ -91,14 +105,18 @@ define amdgpu_kernel void @fshl_i32(ptr addrspace(1) %in, i32 %x, i32 %y, i32 %z ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c ; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_alignbit_b32 v0, s0, s1, 1 -; GFX11-NEXT: s_lshr_b32 s0, s0, 1 -; GFX11-NEXT: s_not_b32 s1, s2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_alignbit_b32 v0, s0, v0, s1 -; GFX11-NEXT: global_store_b32 v1, v0, s[4:5] +; GFX11-NEXT: s_mov_b32 s6, s1 +; GFX11-NEXT: s_mov_b32 s7, s0 +; GFX11-NEXT: s_lshr_b32 s3, s0, 1 +; GFX11-NEXT: s_not_b32 s2, s2 +; GFX11-NEXT: s_lshr_b64 s[0:1], s[6:7], 1 +; GFX11-NEXT: s_mov_b32 s1, s3 +; GFX11-NEXT: s_and_b32 s2, s2, 31 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_lshr_b64 s[0:1], s[0:1], s2 +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0 +; GFX11-NEXT: global_store_b32 v0, v1, s[4:5] ; GFX11-NEXT: s_endpgm entry: %0 = call i32 @llvm.fshl.i32(i32 %x, i32 %y, i32 %z) @@ -113,10 +131,12 @@ define amdgpu_kernel void @fshl_i32_imm(ptr addrspace(1) %in, i32 %x, i32 %y) { ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s3 ; SI-NEXT: s_mov_b32 s4, s0 ; SI-NEXT: s_mov_b32 s5, s1 -; SI-NEXT: v_alignbit_b32 v0, s2, v0, 25 +; SI-NEXT: s_mov_b32 s0, s3 +; SI-NEXT: s_mov_b32 s1, s2 +; SI-NEXT: s_lshr_b64 s[0:1], s[0:1], 25 +; SI-NEXT: v_mov_b32_e32 v0, s0 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; @@ -124,10 +144,12 @@ define amdgpu_kernel void @fshl_i32_imm(ptr addrspace(1) %in, i32 %x, i32 %y) { ; VI: ; %bb.0: ; %entry ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s3 -; VI-NEXT: v_alignbit_b32 v2, s2, v0, 25 +; VI-NEXT: s_mov_b32 s4, s3 +; VI-NEXT: s_mov_b32 s5, s2 +; VI-NEXT: s_lshr_b64 s[2:3], s[4:5], 25 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -136,8 +158,10 @@ define amdgpu_kernel void @fshl_i32_imm(ptr addrspace(1) %in, i32 %x, i32 %y) { ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_alignbit_b32 v1, s2, v1, 25 +; GFX9-NEXT: s_mov_b32 s4, s3 +; GFX9-NEXT: s_mov_b32 s5, s2 +; GFX9-NEXT: s_lshr_b64 s[2:3], s[4:5], 25 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; @@ -158,16 +182,22 @@ define amdgpu_kernel void @fshl_i32_imm(ptr addrspace(1) %in, i32 %x, i32 %y) { ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_alignbit_b32 v1, s2, s3, 25 +; GFX10-NEXT: s_mov_b32 s4, s3 +; GFX10-NEXT: s_mov_b32 s5, s2 +; GFX10-NEXT: s_lshr_b64 s[2:3], s[4:5], 25 +; GFX10-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: fshl_i32_imm: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_alignbit_b32 v1, s2, s3, 25 +; GFX11-NEXT: s_mov_b32 s4, s3 +; GFX11-NEXT: s_mov_b32 s5, s2 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) 
| instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_lshr_b64 s[2:3], s[4:5], 25 +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_endpgm entry: @@ -185,41 +215,51 @@ define amdgpu_kernel void @fshl_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s3 -; SI-NEXT: v_alignbit_b32 v0, s1, v0, 1 -; SI-NEXT: s_not_b32 s3, s5 -; SI-NEXT: s_lshr_b32 s1, s1, 1 -; SI-NEXT: v_mov_b32_e32 v1, s3 -; SI-NEXT: v_alignbit_b32 v1, s1, v0, v1 -; SI-NEXT: v_mov_b32_e32 v0, s2 -; SI-NEXT: s_not_b32 s1, s4 -; SI-NEXT: v_alignbit_b32 v0, s0, v0, 1 -; SI-NEXT: s_lshr_b32 s0, s0, 1 -; SI-NEXT: v_mov_b32_e32 v2, s1 -; SI-NEXT: v_alignbit_b32 v0, s0, v0, v2 +; SI-NEXT: s_mov_b32 s6, s3 +; SI-NEXT: s_mov_b32 s7, s1 +; SI-NEXT: s_lshr_b32 s12, s1, 1 +; SI-NEXT: s_lshr_b64 s[6:7], s[6:7], 1 +; SI-NEXT: s_not_b32 s1, s5 +; SI-NEXT: s_mov_b32 s7, s12 +; SI-NEXT: s_and_b32 s1, s1, 31 +; SI-NEXT: s_mov_b32 s3, s0 +; SI-NEXT: s_lshr_b64 s[6:7], s[6:7], s1 +; SI-NEXT: s_lshr_b32 s5, s0, 1 +; SI-NEXT: s_lshr_b64 s[0:1], s[2:3], 1 +; SI-NEXT: s_not_b32 s2, s4 +; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_and_b32 s2, s2, 31 +; SI-NEXT: s_lshr_b64 s[0:1], s[0:1], s2 +; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: v_mov_b32_e32 v1, s6 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: fshl_v2i32: ; VI: ; %bb.0: ; %entry ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c -; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c -; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x3c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s3 -; VI-NEXT: s_not_b32 s7, s7 -; VI-NEXT: s_lshr_b32 s3, s1, 1 -; VI-NEXT: v_alignbit_b32 v0, s1, v0, 1 -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_alignbit_b32 v1, s3, v0, v1 -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: s_not_b32 s1, s6 -; VI-NEXT: v_alignbit_b32 v0, s0, v0, 1 -; VI-NEXT: s_lshr_b32 s0, s0, 1 -; VI-NEXT: v_mov_b32_e32 v2, s1 -; VI-NEXT: v_alignbit_b32 v0, s0, v0, v2 -; VI-NEXT: v_mov_b32_e32 v2, s4 -; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: s_mov_b32 s8, s3 +; VI-NEXT: s_mov_b32 s9, s1 +; VI-NEXT: s_lshr_b32 s10, s1, 1 +; VI-NEXT: s_lshr_b64 s[8:9], s[8:9], 1 +; VI-NEXT: s_not_b32 s1, s5 +; VI-NEXT: s_mov_b32 s9, s10 +; VI-NEXT: s_and_b32 s1, s1, 31 +; VI-NEXT: s_mov_b32 s3, s0 +; VI-NEXT: s_lshr_b64 s[8:9], s[8:9], s1 +; VI-NEXT: s_lshr_b32 s5, s0, 1 +; VI-NEXT: s_lshr_b64 s[0:1], s[2:3], 1 +; VI-NEXT: s_not_b32 s2, s4 +; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_and_b32 s2, s2, 31 +; VI-NEXT: s_lshr_b64 s[0:1], s[0:1], s2 +; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s8 +; VI-NEXT: v_mov_b32_e32 v3, s7 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm ; @@ -230,18 +270,23 @@ define amdgpu_kernel void @fshl_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i ; GFX9-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x3c ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s3 -; GFX9-NEXT: s_lshr_b32 s3, s1, 1 -; GFX9-NEXT: v_alignbit_b32 v0, s1, v0, 1 +; GFX9-NEXT: s_mov_b32 s4, s3 +; GFX9-NEXT: s_mov_b32 s5, s1 +; GFX9-NEXT: s_lshr_b32 s10, s1, 1 +; GFX9-NEXT: s_lshr_b64 s[4:5], s[4:5], 1 ; GFX9-NEXT: s_not_b32 s1, s9 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_alignbit_b32 v1, 
s3, v0, v1 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: s_not_b32 s1, s8 -; GFX9-NEXT: v_alignbit_b32 v0, s0, v0, 1 -; GFX9-NEXT: s_lshr_b32 s0, s0, 1 -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_alignbit_b32 v0, s0, v0, v3 +; GFX9-NEXT: s_mov_b32 s5, s10 +; GFX9-NEXT: s_and_b32 s1, s1, 31 +; GFX9-NEXT: s_lshr_b64 s[4:5], s[4:5], s1 +; GFX9-NEXT: s_mov_b32 s3, s0 +; GFX9-NEXT: s_lshr_b32 s5, s0, 1 +; GFX9-NEXT: s_lshr_b64 s[0:1], s[2:3], 1 +; GFX9-NEXT: s_not_b32 s2, s8 +; GFX9-NEXT: s_mov_b32 s1, s5 +; GFX9-NEXT: s_and_b32 s2, s2, 31 +; GFX9-NEXT: s_lshr_b64 s[0:1], s[0:1], s2 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] ; GFX9-NEXT: s_endpgm ; @@ -271,14 +316,23 @@ define amdgpu_kernel void @fshl_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i ; GFX10-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_alignbit_b32 v0, s1, s3, 1 -; GFX10-NEXT: v_alignbit_b32 v3, s0, s2, 1 -; GFX10-NEXT: s_lshr_b32 s1, s1, 1 -; GFX10-NEXT: s_not_b32 s2, s7 -; GFX10-NEXT: s_lshr_b32 s0, s0, 1 -; GFX10-NEXT: s_not_b32 s3, s6 -; GFX10-NEXT: v_alignbit_b32 v1, s1, v0, s2 -; GFX10-NEXT: v_alignbit_b32 v0, s0, v3, s3 +; GFX10-NEXT: s_mov_b32 s4, s3 +; GFX10-NEXT: s_mov_b32 s5, s1 +; GFX10-NEXT: s_mov_b32 s3, s0 +; GFX10-NEXT: s_lshr_b32 s10, s1, 1 +; GFX10-NEXT: s_not_b32 s7, s7 +; GFX10-NEXT: s_lshr_b32 s11, s0, 1 +; GFX10-NEXT: s_not_b32 s6, s6 +; GFX10-NEXT: s_lshr_b64 s[0:1], s[4:5], 1 +; GFX10-NEXT: s_lshr_b64 s[2:3], s[2:3], 1 +; GFX10-NEXT: s_and_b32 s4, s7, 31 +; GFX10-NEXT: s_and_b32 s5, s6, 31 +; GFX10-NEXT: s_mov_b32 s3, s11 +; GFX10-NEXT: s_mov_b32 s1, s10 +; GFX10-NEXT: s_lshr_b64 s[2:3], s[2:3], s5 +; GFX10-NEXT: s_lshr_b64 s[0:1], s[0:1], s4 +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: v_mov_b32_e32 v1, s0 ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] ; GFX10-NEXT: s_endpgm ; @@ -288,16 +342,25 @@ define amdgpu_kernel void @fshl_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c ; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x3c ; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_alignbit_b32 v0, s1, s3, 1 -; GFX11-NEXT: v_alignbit_b32 v3, s0, s2, 1 -; GFX11-NEXT: s_lshr_b32 s1, s1, 1 -; GFX11-NEXT: s_not_b32 s2, s7 -; GFX11-NEXT: s_lshr_b32 s0, s0, 1 -; GFX11-NEXT: s_not_b32 s3, s6 -; GFX11-NEXT: v_alignbit_b32 v1, s1, v0, s2 -; GFX11-NEXT: v_alignbit_b32 v0, s0, v3, s3 +; GFX11-NEXT: s_mov_b32 s8, s3 +; GFX11-NEXT: s_mov_b32 s9, s1 +; GFX11-NEXT: s_mov_b32 s3, s0 +; GFX11-NEXT: s_lshr_b32 s10, s1, 1 +; GFX11-NEXT: s_not_b32 s7, s7 +; GFX11-NEXT: s_lshr_b32 s11, s0, 1 +; GFX11-NEXT: s_not_b32 s6, s6 +; GFX11-NEXT: s_lshr_b64 s[0:1], s[8:9], 1 +; GFX11-NEXT: s_lshr_b64 s[2:3], s[2:3], 1 +; GFX11-NEXT: s_and_b32 s7, s7, 31 +; GFX11-NEXT: s_and_b32 s6, s6, 31 +; GFX11-NEXT: s_mov_b32 s3, s11 +; GFX11-NEXT: s_mov_b32 s1, s10 +; GFX11-NEXT: s_lshr_b64 s[2:3], s[2:3], s6 +; GFX11-NEXT: s_lshr_b64 s[0:1], s[0:1], s7 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s0 +; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] ; GFX11-NEXT: s_endpgm entry: @@ -314,10 +377,13 @@ define amdgpu_kernel void @fshl_v2i32_imm(ptr addrspace(1) %in, <2 x i32> %x, <2 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: 
s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s3 -; SI-NEXT: v_mov_b32_e32 v2, s2 -; SI-NEXT: v_alignbit_b32 v1, s1, v0, 23 -; SI-NEXT: v_alignbit_b32 v0, s0, v2, 25 +; SI-NEXT: s_mov_b32 s8, s3 +; SI-NEXT: s_mov_b32 s9, s1 +; SI-NEXT: s_mov_b32 s3, s0 +; SI-NEXT: s_lshr_b64 s[8:9], s[8:9], 23 +; SI-NEXT: s_lshr_b64 s[0:1], s[2:3], 25 +; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: v_mov_b32_e32 v1, s8 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; @@ -326,11 +392,14 @@ define amdgpu_kernel void @fshl_v2i32_imm(ptr addrspace(1) %in, <2 x i32> %x, <2 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s3 -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_alignbit_b32 v1, s1, v0, 23 -; VI-NEXT: v_alignbit_b32 v0, s0, v2, 25 +; VI-NEXT: s_mov_b32 s6, s3 +; VI-NEXT: s_mov_b32 s7, s1 +; VI-NEXT: s_mov_b32 s3, s0 +; VI-NEXT: s_lshr_b64 s[0:1], s[6:7], 23 +; VI-NEXT: s_lshr_b64 s[2:3], s[2:3], 25 ; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s0 ; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm @@ -341,10 +410,13 @@ define amdgpu_kernel void @fshl_v2i32_imm(ptr addrspace(1) %in, <2 x i32> %x, <2 ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s3 -; GFX9-NEXT: v_mov_b32_e32 v3, s2 -; GFX9-NEXT: v_alignbit_b32 v1, s1, v0, 23 -; GFX9-NEXT: v_alignbit_b32 v0, s0, v3, 25 +; GFX9-NEXT: s_mov_b32 s4, s3 +; GFX9-NEXT: s_mov_b32 s5, s1 +; GFX9-NEXT: s_mov_b32 s3, s0 +; GFX9-NEXT: s_lshr_b64 s[0:1], s[4:5], 23 +; GFX9-NEXT: s_lshr_b64 s[2:3], s[2:3], 25 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] ; GFX9-NEXT: s_endpgm ; @@ -369,8 +441,13 @@ define amdgpu_kernel void @fshl_v2i32_imm(ptr addrspace(1) %in, <2 x i32> %x, <2 ; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_alignbit_b32 v1, s1, s3, 23 -; GFX10-NEXT: v_alignbit_b32 v0, s0, s2, 25 +; GFX10-NEXT: s_mov_b32 s4, s3 +; GFX10-NEXT: s_mov_b32 s3, s0 +; GFX10-NEXT: s_mov_b32 s5, s1 +; GFX10-NEXT: s_lshr_b64 s[0:1], s[2:3], 25 +; GFX10-NEXT: s_lshr_b64 s[2:3], s[4:5], 23 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] ; GFX10-NEXT: s_endpgm ; @@ -379,10 +456,15 @@ define amdgpu_kernel void @fshl_v2i32_imm(ptr addrspace(1) %in, <2 x i32> %x, <2 ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c ; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_alignbit_b32 v1, s1, s3, 23 -; GFX11-NEXT: v_alignbit_b32 v0, s0, s2, 25 +; GFX11-NEXT: s_mov_b32 s6, s3 +; GFX11-NEXT: s_mov_b32 s3, s0 +; GFX11-NEXT: s_mov_b32 s7, s1 +; GFX11-NEXT: s_lshr_b64 s[0:1], s[2:3], 25 +; GFX11-NEXT: s_lshr_b64 s[2:3], s[6:7], 23 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] ; GFX11-NEXT: s_endpgm entry: @@ -395,104 +477,134 @@ define amdgpu_kernel void @fshl_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i ; SI-LABEL: fshl_v4i32: ; SI: ; %bb.0: ; %entry ; SI-NEXT: 
s_load_dwordx8 s[8:15], s[4:5], 0xd -; SI-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x15 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x15 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_not_b32 s5, s19 -; SI-NEXT: v_mov_b32_e32 v0, s15 -; SI-NEXT: v_alignbit_b32 v0, s11, v0, 1 -; SI-NEXT: s_lshr_b32 s4, s11, 1 -; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: v_alignbit_b32 v3, s4, v0, v1 -; SI-NEXT: v_mov_b32_e32 v0, s14 -; SI-NEXT: s_not_b32 s5, s18 -; SI-NEXT: v_alignbit_b32 v0, s10, v0, 1 -; SI-NEXT: s_lshr_b32 s4, s10, 1 -; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: v_alignbit_b32 v2, s4, v0, v1 -; SI-NEXT: v_mov_b32_e32 v0, s13 -; SI-NEXT: s_not_b32 s5, s17 -; SI-NEXT: v_alignbit_b32 v0, s9, v0, 1 -; SI-NEXT: s_lshr_b32 s4, s9, 1 -; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: v_alignbit_b32 v1, s4, v0, v1 -; SI-NEXT: v_mov_b32_e32 v0, s12 -; SI-NEXT: s_not_b32 s5, s16 -; SI-NEXT: v_alignbit_b32 v0, s8, v0, 1 -; SI-NEXT: s_lshr_b32 s4, s8, 1 -; SI-NEXT: v_mov_b32_e32 v4, s5 -; SI-NEXT: v_alignbit_b32 v0, s4, v0, v4 +; SI-NEXT: s_mov_b32 s16, s15 +; SI-NEXT: s_mov_b32 s17, s11 +; SI-NEXT: s_lshr_b32 s18, s11, 1 +; SI-NEXT: s_lshr_b64 s[16:17], s[16:17], 1 +; SI-NEXT: s_not_b32 s7, s7 +; SI-NEXT: s_mov_b32 s17, s18 +; SI-NEXT: s_and_b32 s7, s7, 31 +; SI-NEXT: s_mov_b32 s15, s10 +; SI-NEXT: s_lshr_b64 s[16:17], s[16:17], s7 +; SI-NEXT: s_lshr_b32 s7, s10, 1 +; SI-NEXT: s_lshr_b64 s[10:11], s[14:15], 1 +; SI-NEXT: s_not_b32 s6, s6 +; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_and_b32 s6, s6, 31 +; SI-NEXT: s_lshr_b64 s[6:7], s[10:11], s6 +; SI-NEXT: s_mov_b32 s10, s13 +; SI-NEXT: s_mov_b32 s11, s9 +; SI-NEXT: s_lshr_b32 s7, s9, 1 +; SI-NEXT: s_lshr_b64 s[10:11], s[10:11], 1 +; SI-NEXT: s_not_b32 s5, s5 +; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_and_b32 s5, s5, 31 +; SI-NEXT: s_mov_b32 s13, s8 +; SI-NEXT: s_lshr_b64 s[10:11], s[10:11], s5 +; SI-NEXT: s_lshr_b32 s5, s8, 1 +; SI-NEXT: s_lshr_b64 s[8:9], s[12:13], 1 +; SI-NEXT: s_not_b32 s4, s4 +; SI-NEXT: s_mov_b32 s9, s5 +; SI-NEXT: s_and_b32 s4, s4, 31 +; SI-NEXT: s_lshr_b64 s[4:5], s[8:9], s4 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s10 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s16 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: fshl_v4i32: ; VI: ; %bb.0: ; %entry ; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 +; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 -; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s15 +; VI-NEXT: s_mov_b32 s4, s15 +; VI-NEXT: s_mov_b32 s5, s11 +; VI-NEXT: s_lshr_b32 s16, s11, 1 +; VI-NEXT: s_lshr_b64 s[4:5], s[4:5], 1 ; VI-NEXT: s_not_b32 s3, s3 -; VI-NEXT: s_lshr_b32 s6, s11, 1 -; VI-NEXT: v_alignbit_b32 v0, s11, v0, 1 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_alignbit_b32 v3, s6, v0, v1 -; VI-NEXT: v_mov_b32_e32 v0, s14 -; VI-NEXT: s_not_b32 s2, s2 -; VI-NEXT: v_alignbit_b32 v0, s10, v0, 1 +; VI-NEXT: s_mov_b32 s5, s16 +; VI-NEXT: s_and_b32 s3, s3, 31 +; VI-NEXT: s_mov_b32 s15, s10 +; VI-NEXT: s_lshr_b64 s[4:5], s[4:5], s3 ; VI-NEXT: s_lshr_b32 s3, s10, 1 -; VI-NEXT: v_mov_b32_e32 v1, s2 -; VI-NEXT: v_alignbit_b32 v2, s3, v0, v1 -; VI-NEXT: v_mov_b32_e32 v0, s13 +; VI-NEXT: s_lshr_b64 s[10:11], s[14:15], 1 +; VI-NEXT: s_not_b32 s2, s2 +; VI-NEXT: s_mov_b32 s11, s3 +; VI-NEXT: s_and_b32 s2, s2, 31 +; VI-NEXT: s_lshr_b64 s[2:3], 
s[10:11], s2 +; VI-NEXT: s_mov_b32 s10, s13 +; VI-NEXT: s_mov_b32 s11, s9 +; VI-NEXT: s_lshr_b32 s3, s9, 1 +; VI-NEXT: s_lshr_b64 s[10:11], s[10:11], 1 ; VI-NEXT: s_not_b32 s1, s1 -; VI-NEXT: v_alignbit_b32 v0, s9, v0, 1 -; VI-NEXT: s_lshr_b32 s2, s9, 1 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_alignbit_b32 v1, s2, v0, v1 -; VI-NEXT: v_mov_b32_e32 v0, s12 -; VI-NEXT: s_not_b32 s0, s0 -; VI-NEXT: v_alignbit_b32 v0, s8, v0, 1 +; VI-NEXT: s_mov_b32 s11, s3 +; VI-NEXT: s_and_b32 s1, s1, 31 +; VI-NEXT: s_mov_b32 s13, s8 +; VI-NEXT: s_lshr_b64 s[10:11], s[10:11], s1 ; VI-NEXT: s_lshr_b32 s1, s8, 1 -; VI-NEXT: v_mov_b32_e32 v4, s0 -; VI-NEXT: v_alignbit_b32 v0, s1, v0, v4 -; VI-NEXT: v_mov_b32_e32 v4, s4 -; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: s_lshr_b64 s[8:9], s[12:13], 1 +; VI-NEXT: s_not_b32 s0, s0 +; VI-NEXT: s_mov_b32 s9, s1 +; VI-NEXT: s_and_b32 s0, s0, 31 +; VI-NEXT: s_lshr_b64 s[0:1], s[8:9], s0 +; VI-NEXT: v_mov_b32_e32 v4, s6 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s10 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: v_mov_b32_e32 v5, s7 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: fshl_v4i32: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s4, s15 +; GFX9-NEXT: s_mov_b32 s5, s11 +; GFX9-NEXT: s_lshr_b32 s16, s11, 1 +; GFX9-NEXT: s_lshr_b64 s[4:5], s[4:5], 1 ; GFX9-NEXT: s_not_b32 s3, s3 -; GFX9-NEXT: v_mov_b32_e32 v0, s15 -; GFX9-NEXT: s_lshr_b32 s4, s11, 1 -; GFX9-NEXT: v_alignbit_b32 v0, s11, v0, 1 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_alignbit_b32 v3, s4, v0, v1 -; GFX9-NEXT: v_mov_b32_e32 v0, s14 -; GFX9-NEXT: s_not_b32 s2, s2 -; GFX9-NEXT: v_alignbit_b32 v0, s10, v0, 1 +; GFX9-NEXT: s_mov_b32 s5, s16 +; GFX9-NEXT: s_and_b32 s3, s3, 31 +; GFX9-NEXT: s_mov_b32 s15, s10 +; GFX9-NEXT: s_lshr_b64 s[4:5], s[4:5], s3 ; GFX9-NEXT: s_lshr_b32 s3, s10, 1 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: v_alignbit_b32 v2, s3, v0, v1 -; GFX9-NEXT: v_mov_b32_e32 v0, s13 +; GFX9-NEXT: s_lshr_b64 s[10:11], s[14:15], 1 +; GFX9-NEXT: s_not_b32 s2, s2 +; GFX9-NEXT: s_mov_b32 s11, s3 +; GFX9-NEXT: s_and_b32 s2, s2, 31 +; GFX9-NEXT: s_lshr_b64 s[2:3], s[10:11], s2 +; GFX9-NEXT: s_mov_b32 s10, s13 +; GFX9-NEXT: s_mov_b32 s11, s9 +; GFX9-NEXT: s_lshr_b32 s3, s9, 1 +; GFX9-NEXT: s_lshr_b64 s[10:11], s[10:11], 1 ; GFX9-NEXT: s_not_b32 s1, s1 -; GFX9-NEXT: v_alignbit_b32 v0, s9, v0, 1 -; GFX9-NEXT: s_lshr_b32 s2, s9, 1 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_alignbit_b32 v1, s2, v0, v1 -; GFX9-NEXT: v_mov_b32_e32 v0, s12 -; GFX9-NEXT: s_not_b32 s0, s0 -; GFX9-NEXT: v_alignbit_b32 v0, s8, v0, 1 +; GFX9-NEXT: s_mov_b32 s11, s3 +; GFX9-NEXT: s_and_b32 s1, s1, 31 +; GFX9-NEXT: s_mov_b32 s13, s8 +; GFX9-NEXT: s_lshr_b64 s[10:11], s[10:11], s1 ; GFX9-NEXT: s_lshr_b32 s1, s8, 1 -; GFX9-NEXT: v_mov_b32_e32 v5, s0 -; GFX9-NEXT: v_alignbit_b32 v0, s1, v0, v5 +; GFX9-NEXT: s_lshr_b64 s[8:9], s[12:13], 1 +; GFX9-NEXT: s_not_b32 s0, s0 +; GFX9-NEXT: s_mov_b32 s9, s1 +; GFX9-NEXT: s_and_b32 s0, s0, 31 +; GFX9-NEXT: s_lshr_b64 s[0:1], s[8:9], s0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s10 +; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: v_mov_b32_e32 v3, s4 ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] 
; GFX9-NEXT: s_endpgm ; @@ -530,22 +642,40 @@ define amdgpu_kernel void @fshl_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i ; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_alignbit_b32 v0, s11, s15, 1 -; GFX10-NEXT: v_alignbit_b32 v1, s10, s14, 1 -; GFX10-NEXT: v_alignbit_b32 v5, s9, s13, 1 -; GFX10-NEXT: v_alignbit_b32 v6, s8, s12, 1 -; GFX10-NEXT: s_lshr_b32 s4, s11, 1 -; GFX10-NEXT: s_not_b32 s3, s3 -; GFX10-NEXT: s_lshr_b32 s5, s10, 1 -; GFX10-NEXT: s_not_b32 s2, s2 -; GFX10-NEXT: s_lshr_b32 s9, s9, 1 +; GFX10-NEXT: s_mov_b32 s4, s15 +; GFX10-NEXT: s_mov_b32 s5, s11 +; GFX10-NEXT: s_mov_b32 s15, s10 +; GFX10-NEXT: s_lshr_b32 s16, s11, 1 +; GFX10-NEXT: s_not_b32 s11, s3 +; GFX10-NEXT: s_lshr_b32 s17, s10, 1 +; GFX10-NEXT: s_not_b32 s10, s2 +; GFX10-NEXT: s_lshr_b32 s18, s9, 1 +; GFX10-NEXT: s_mov_b32 s2, s13 +; GFX10-NEXT: s_mov_b32 s3, s9 +; GFX10-NEXT: s_lshr_b32 s19, s8, 1 +; GFX10-NEXT: s_mov_b32 s13, s8 +; GFX10-NEXT: s_lshr_b64 s[4:5], s[4:5], 1 +; GFX10-NEXT: s_lshr_b64 s[8:9], s[14:15], 1 +; GFX10-NEXT: s_and_b32 s11, s11, 31 +; GFX10-NEXT: s_and_b32 s10, s10, 31 +; GFX10-NEXT: s_mov_b32 s5, s16 +; GFX10-NEXT: s_mov_b32 s9, s17 ; GFX10-NEXT: s_not_b32 s1, s1 -; GFX10-NEXT: s_lshr_b32 s8, s8, 1 ; GFX10-NEXT: s_not_b32 s0, s0 -; GFX10-NEXT: v_alignbit_b32 v3, s4, v0, s3 -; GFX10-NEXT: v_alignbit_b32 v2, s5, v1, s2 -; GFX10-NEXT: v_alignbit_b32 v1, s9, v5, s1 -; GFX10-NEXT: v_alignbit_b32 v0, s8, v6, s0 +; GFX10-NEXT: s_lshr_b64 s[2:3], s[2:3], 1 +; GFX10-NEXT: s_lshr_b64 s[4:5], s[4:5], s11 +; GFX10-NEXT: s_lshr_b64 s[8:9], s[8:9], s10 +; GFX10-NEXT: s_lshr_b64 s[10:11], s[12:13], 1 +; GFX10-NEXT: s_mov_b32 s3, s18 +; GFX10-NEXT: s_mov_b32 s11, s19 +; GFX10-NEXT: s_and_b32 s0, s0, 31 +; GFX10-NEXT: s_and_b32 s5, s1, 31 +; GFX10-NEXT: s_lshr_b64 s[0:1], s[10:11], s0 +; GFX10-NEXT: s_lshr_b64 s[2:3], s[2:3], s5 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-NEXT: v_mov_b32_e32 v2, s8 +; GFX10-NEXT: v_mov_b32_e32 v3, s4 ; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] ; GFX10-NEXT: s_endpgm ; @@ -555,24 +685,41 @@ define amdgpu_kernel void @fshl_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i ; GFX11-NEXT: s_load_b256 s[8:15], s[4:5], 0x34 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x54 ; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_alignbit_b32 v0, s11, s15, 1 -; GFX11-NEXT: v_alignbit_b32 v1, s10, s14, 1 -; GFX11-NEXT: v_alignbit_b32 v5, s9, s13, 1 -; GFX11-NEXT: v_alignbit_b32 v6, s8, s12, 1 -; GFX11-NEXT: s_lshr_b32 s6, s11, 1 -; GFX11-NEXT: s_not_b32 s3, s3 -; GFX11-NEXT: s_lshr_b32 s7, s10, 1 -; GFX11-NEXT: s_not_b32 s2, s2 -; GFX11-NEXT: s_lshr_b32 s9, s9, 1 +; GFX11-NEXT: s_mov_b32 s6, s15 +; GFX11-NEXT: s_mov_b32 s7, s11 +; GFX11-NEXT: s_mov_b32 s15, s10 +; GFX11-NEXT: s_lshr_b32 s16, s11, 1 +; GFX11-NEXT: s_not_b32 s11, s3 +; GFX11-NEXT: s_lshr_b32 s17, s10, 1 +; GFX11-NEXT: s_not_b32 s10, s2 +; GFX11-NEXT: s_lshr_b32 s18, s9, 1 +; GFX11-NEXT: s_mov_b32 s2, s13 +; GFX11-NEXT: s_mov_b32 s3, s9 +; GFX11-NEXT: s_lshr_b32 s19, s8, 1 +; GFX11-NEXT: s_mov_b32 s13, s8 +; GFX11-NEXT: s_lshr_b64 s[6:7], s[6:7], 1 +; GFX11-NEXT: s_lshr_b64 s[8:9], s[14:15], 1 +; GFX11-NEXT: s_and_b32 s11, s11, 31 +; GFX11-NEXT: s_and_b32 s10, s10, 31 +; GFX11-NEXT: s_mov_b32 s7, s16 +; GFX11-NEXT: s_mov_b32 s9, s17 ; GFX11-NEXT: s_not_b32 s1, s1 -; GFX11-NEXT: s_lshr_b32 s8, s8, 1 ; 
GFX11-NEXT: s_not_b32 s0, s0 -; GFX11-NEXT: v_alignbit_b32 v3, s6, v0, s3 -; GFX11-NEXT: v_alignbit_b32 v2, s7, v1, s2 -; GFX11-NEXT: v_alignbit_b32 v1, s9, v5, s1 -; GFX11-NEXT: v_alignbit_b32 v0, s8, v6, s0 +; GFX11-NEXT: s_lshr_b64 s[2:3], s[2:3], 1 +; GFX11-NEXT: s_lshr_b64 s[6:7], s[6:7], s11 +; GFX11-NEXT: s_lshr_b64 s[8:9], s[8:9], s10 +; GFX11-NEXT: s_lshr_b64 s[10:11], s[12:13], 1 +; GFX11-NEXT: s_mov_b32 s3, s18 +; GFX11-NEXT: s_mov_b32 s11, s19 +; GFX11-NEXT: s_and_b32 s0, s0, 31 +; GFX11-NEXT: s_and_b32 s7, s1, 31 +; GFX11-NEXT: s_lshr_b64 s[0:1], s[10:11], s0 +; GFX11-NEXT: s_lshr_b64 s[2:3], s[2:3], s7 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s6 +; GFX11-NEXT: v_mov_b32_e32 v2, s8 ; GFX11-NEXT: global_store_b128 v4, v[0:3], s[4:5] ; GFX11-NEXT: s_endpgm entry: @@ -589,14 +736,20 @@ define amdgpu_kernel void @fshl_v4i32_imm(ptr addrspace(1) %in, <4 x i32> %x, <4 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s15 -; SI-NEXT: v_mov_b32_e32 v1, s14 -; SI-NEXT: v_alignbit_b32 v3, s11, v0, 31 -; SI-NEXT: v_mov_b32_e32 v0, s13 -; SI-NEXT: v_alignbit_b32 v2, s10, v1, 23 -; SI-NEXT: v_alignbit_b32 v1, s9, v0, 25 -; SI-NEXT: v_mov_b32_e32 v0, s12 -; SI-NEXT: v_alignbit_b32 v0, s8, v0, 31 +; SI-NEXT: s_mov_b32 s4, s15 +; SI-NEXT: s_mov_b32 s5, s11 +; SI-NEXT: s_mov_b32 s15, s10 +; SI-NEXT: s_mov_b32 s10, s13 +; SI-NEXT: s_mov_b32 s11, s9 +; SI-NEXT: s_mov_b32 s13, s8 +; SI-NEXT: s_lshr_b64 s[4:5], s[4:5], 31 +; SI-NEXT: s_lshr_b64 s[6:7], s[14:15], 23 +; SI-NEXT: s_lshr_b64 s[10:11], s[10:11], 25 +; SI-NEXT: s_lshr_b64 s[8:9], s[12:13], 31 +; SI-NEXT: v_mov_b32_e32 v0, s8 +; SI-NEXT: v_mov_b32_e32 v1, s10 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s4 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; @@ -605,15 +758,21 @@ define amdgpu_kernel void @fshl_v4i32_imm(ptr addrspace(1) %in, <4 x i32> %x, <4 ; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s15 -; VI-NEXT: v_mov_b32_e32 v1, s14 -; VI-NEXT: v_mov_b32_e32 v4, s13 -; VI-NEXT: v_alignbit_b32 v3, s11, v0, 31 -; VI-NEXT: v_alignbit_b32 v2, s10, v1, 23 -; VI-NEXT: v_alignbit_b32 v1, s9, v4, 25 -; VI-NEXT: v_mov_b32_e32 v0, s12 +; VI-NEXT: s_mov_b32 s2, s15 +; VI-NEXT: s_mov_b32 s3, s11 +; VI-NEXT: s_mov_b32 s15, s10 +; VI-NEXT: s_mov_b32 s6, s13 +; VI-NEXT: s_mov_b32 s7, s9 +; VI-NEXT: s_mov_b32 s13, s8 +; VI-NEXT: s_lshr_b64 s[2:3], s[2:3], 31 +; VI-NEXT: s_lshr_b64 s[4:5], s[14:15], 23 +; VI-NEXT: s_lshr_b64 s[6:7], s[6:7], 25 +; VI-NEXT: s_lshr_b64 s[8:9], s[12:13], 31 ; VI-NEXT: v_mov_b32_e32 v5, s1 -; VI-NEXT: v_alignbit_b32 v0, s8, v0, 31 +; VI-NEXT: v_mov_b32_e32 v0, s8 +; VI-NEXT: v_mov_b32_e32 v1, s6 +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_mov_b32_e32 v3, s2 ; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_endpgm @@ -624,14 +783,20 @@ define amdgpu_kernel void @fshl_v4i32_imm(ptr addrspace(1) %in, <4 x i32> %x, <4 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s15 -; GFX9-NEXT: v_mov_b32_e32 v1, s14 -; GFX9-NEXT: v_alignbit_b32 v3, s11, v0, 31 -; GFX9-NEXT: v_mov_b32_e32 v0, s13 -; GFX9-NEXT: v_alignbit_b32 v2, s10, v1, 23 -; 
GFX9-NEXT: v_alignbit_b32 v1, s9, v0, 25 -; GFX9-NEXT: v_mov_b32_e32 v0, s12 -; GFX9-NEXT: v_alignbit_b32 v0, s8, v0, 31 +; GFX9-NEXT: s_mov_b32 s2, s15 +; GFX9-NEXT: s_mov_b32 s3, s11 +; GFX9-NEXT: s_mov_b32 s15, s10 +; GFX9-NEXT: s_mov_b32 s6, s13 +; GFX9-NEXT: s_mov_b32 s7, s9 +; GFX9-NEXT: s_mov_b32 s13, s8 +; GFX9-NEXT: s_lshr_b64 s[2:3], s[2:3], 31 +; GFX9-NEXT: s_lshr_b64 s[4:5], s[14:15], 23 +; GFX9-NEXT: s_lshr_b64 s[6:7], s[6:7], 25 +; GFX9-NEXT: s_lshr_b64 s[8:9], s[12:13], 31 +; GFX9-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mov_b32_e32 v3, s2 ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX9-NEXT: s_endpgm ; @@ -660,10 +825,20 @@ define amdgpu_kernel void @fshl_v4i32_imm(ptr addrspace(1) %in, <4 x i32> %x, <4 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_alignbit_b32 v3, s11, s15, 31 -; GFX10-NEXT: v_alignbit_b32 v2, s10, s14, 23 -; GFX10-NEXT: v_alignbit_b32 v1, s9, s13, 25 -; GFX10-NEXT: v_alignbit_b32 v0, s8, s12, 31 +; GFX10-NEXT: s_mov_b32 s2, s15 +; GFX10-NEXT: s_mov_b32 s3, s11 +; GFX10-NEXT: s_mov_b32 s15, s10 +; GFX10-NEXT: s_mov_b32 s4, s13 +; GFX10-NEXT: s_mov_b32 s5, s9 +; GFX10-NEXT: s_mov_b32 s13, s8 +; GFX10-NEXT: s_lshr_b64 s[2:3], s[2:3], 31 +; GFX10-NEXT: s_lshr_b64 s[6:7], s[14:15], 23 +; GFX10-NEXT: s_lshr_b64 s[8:9], s[12:13], 31 +; GFX10-NEXT: s_lshr_b64 s[4:5], s[4:5], 25 +; GFX10-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-NEXT: v_mov_b32_e32 v2, s6 +; GFX10-NEXT: v_mov_b32_e32 v3, s2 ; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX10-NEXT: s_endpgm ; @@ -672,12 +847,21 @@ define amdgpu_kernel void @fshl_v4i32_imm(ptr addrspace(1) %in, <4 x i32> %x, <4 ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: s_load_b256 s[8:15], s[4:5], 0x34 ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_alignbit_b32 v3, s11, s15, 31 -; GFX11-NEXT: v_alignbit_b32 v2, s10, s14, 23 -; GFX11-NEXT: v_alignbit_b32 v1, s9, s13, 25 -; GFX11-NEXT: v_alignbit_b32 v0, s8, s12, 31 +; GFX11-NEXT: s_mov_b32 s2, s15 +; GFX11-NEXT: s_mov_b32 s3, s11 +; GFX11-NEXT: s_mov_b32 s15, s10 +; GFX11-NEXT: s_mov_b32 s4, s13 +; GFX11-NEXT: s_mov_b32 s5, s9 +; GFX11-NEXT: s_mov_b32 s13, s8 +; GFX11-NEXT: s_lshr_b64 s[2:3], s[2:3], 31 +; GFX11-NEXT: s_lshr_b64 s[6:7], s[14:15], 23 +; GFX11-NEXT: s_lshr_b64 s[8:9], s[12:13], 31 +; GFX11-NEXT: s_lshr_b64 s[4:5], s[4:5], 25 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s4 +; GFX11-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v3, s2 +; GFX11-NEXT: v_mov_b32_e32 v2, s6 ; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1] ; GFX11-NEXT: s_endpgm entry: diff --git a/llvm/test/CodeGen/AMDGPU/fshr.ll b/llvm/test/CodeGen/AMDGPU/fshr.ll index ef68f44bac203..7afb2cf317869 100644 --- a/llvm/test/CodeGen/AMDGPU/fshr.ll +++ b/llvm/test/CodeGen/AMDGPU/fshr.ll @@ -30,9 +30,11 @@ define amdgpu_kernel void @fshr_i32(ptr addrspace(1) %in, i32 %x, i32 %y, i32 %z ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s1 -; SI-NEXT: v_mov_b32_e32 v1, s2 -; SI-NEXT: v_alignbit_b32 v0, s0, v0, v1 +; SI-NEXT: s_mov_b32 s8, s1 +; SI-NEXT: s_mov_b32 s9, s0 +; SI-NEXT: s_and_b32 s0, s2, 31 +; SI-NEXT: s_lshr_b64 s[0:1], s[8:9], s0 +; SI-NEXT: v_mov_b32_e32 v0, s0 ; SI-NEXT: 
buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; @@ -41,11 +43,13 @@ define amdgpu_kernel void @fshr_i32(ptr addrspace(1) %in, i32 %x, i32 %y, i32 %z ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s1 -; VI-NEXT: v_mov_b32_e32 v1, s2 -; VI-NEXT: v_alignbit_b32 v2, s0, v0, v1 +; VI-NEXT: s_mov_b32 s6, s1 +; VI-NEXT: s_mov_b32 s7, s0 +; VI-NEXT: s_and_b32 s0, s2, 31 +; VI-NEXT: s_lshr_b64 s[0:1], s[6:7], s0 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -55,9 +59,11 @@ define amdgpu_kernel void @fshr_i32(ptr addrspace(1) %in, i32 %x, i32 %y, i32 %z ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-NEXT: v_alignbit_b32 v1, s0, v1, v2 +; GFX9-NEXT: s_mov_b32 s4, s1 +; GFX9-NEXT: s_mov_b32 s5, s0 +; GFX9-NEXT: s_and_b32 s0, s2, 31 +; GFX9-NEXT: s_lshr_b64 s[0:1], s[4:5], s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: global_store_dword v0, v1, s[6:7] ; GFX9-NEXT: s_endpgm ; @@ -77,62 +83,45 @@ define amdgpu_kernel void @fshr_i32(ptr addrspace(1) %in, i32 %x, i32 %y, i32 %z ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c ; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-NEXT: v_alignbit_b32 v0, s0, s1, v0 -; GFX10-NEXT: global_store_dword v1, v0, s[6:7] +; GFX10-NEXT: s_mov_b32 s4, s1 +; GFX10-NEXT: s_mov_b32 s5, s0 +; GFX10-NEXT: s_and_b32 s0, s2, 31 +; GFX10-NEXT: s_lshr_b64 s[0:1], s[4:5], s0 +; GFX10-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-NEXT: global_store_dword v0, v1, s[6:7] ; GFX10-NEXT: s_endpgm ; -; GFX11-TRUE16-LABEL: fshr_i32: -; GFX11-TRUE16: ; %bb.0: ; %entry -; GFX11-TRUE16-NEXT: s_clause 0x1 -; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c -; GFX11-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0 -; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_alignbit_b32 v0, s0, s1, v0.l -; GFX11-TRUE16-NEXT: global_store_b32 v1, v0, s[4:5] -; GFX11-TRUE16-NEXT: s_endpgm -; -; GFX11-FAKE16-LABEL: fshr_i32: -; GFX11-FAKE16: ; %bb.0: ; %entry -; GFX11-FAKE16-NEXT: s_clause 0x1 -; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c -; GFX11-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 -; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_alignbit_b32 v0, s0, s1, v0 -; GFX11-FAKE16-NEXT: global_store_b32 v1, v0, s[4:5] -; GFX11-FAKE16-NEXT: s_endpgm -; -; GFX12-TRUE16-LABEL: fshr_i32: -; GFX12-TRUE16: ; %bb.0: ; %entry -; GFX12-TRUE16-NEXT: s_clause 0x1 -; GFX12-TRUE16-NEXT: s_load_b96 s[0:2], s[4:5], 0x2c -; GFX12-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v1, 0 -; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_alignbit_b32 v0, s0, s1, v0.l -; GFX12-TRUE16-NEXT: global_store_b32 v1, v0, s[4:5] -; 
GFX12-TRUE16-NEXT: s_endpgm -; -; GFX12-FAKE16-LABEL: fshr_i32: -; GFX12-FAKE16: ; %bb.0: ; %entry -; GFX12-FAKE16-NEXT: s_clause 0x1 -; GFX12-FAKE16-NEXT: s_load_b96 s[0:2], s[4:5], 0x2c -; GFX12-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 -; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-FAKE16-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_alignbit_b32 v0, s0, s1, v0 -; GFX12-FAKE16-NEXT: global_store_b32 v1, v0, s[4:5] -; GFX12-FAKE16-NEXT: s_endpgm +; GFX11-LABEL: fshr_i32: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s6, s1 +; GFX11-NEXT: s_mov_b32 s7, s0 +; GFX11-NEXT: s_and_b32 s0, s2, 31 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_lshr_b64 s[0:1], s[6:7], s0 +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0 +; GFX11-NEXT: global_store_b32 v0, v1, s[4:5] +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: fshr_i32: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b96 s[0:2], s[4:5], 0x2c +; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_mov_b32 s6, s1 +; GFX12-NEXT: s_mov_b32 s7, s0 +; GFX12-NEXT: s_and_b32 s0, s2, 31 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_lshr_b64 s[0:1], s[6:7], s0 +; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0 +; GFX12-NEXT: global_store_b32 v0, v1, s[4:5] +; GFX12-NEXT: s_endpgm entry: %0 = call i32 @llvm.fshr.i32(i32 %x, i32 %y, i32 %z) store i32 %0, ptr addrspace(1) %in @@ -146,10 +135,12 @@ define amdgpu_kernel void @fshr_i32_imm(ptr addrspace(1) %in, i32 %x, i32 %y) { ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s3 ; SI-NEXT: s_mov_b32 s4, s0 ; SI-NEXT: s_mov_b32 s5, s1 -; SI-NEXT: v_alignbit_b32 v0, s2, v0, 7 +; SI-NEXT: s_mov_b32 s0, s3 +; SI-NEXT: s_mov_b32 s1, s2 +; SI-NEXT: s_lshr_b64 s[0:1], s[0:1], 7 +; SI-NEXT: v_mov_b32_e32 v0, s0 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; @@ -157,10 +148,12 @@ define amdgpu_kernel void @fshr_i32_imm(ptr addrspace(1) %in, i32 %x, i32 %y) { ; VI: ; %bb.0: ; %entry ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s3 -; VI-NEXT: v_alignbit_b32 v2, s2, v0, 7 +; VI-NEXT: s_mov_b32 s4, s3 +; VI-NEXT: s_mov_b32 s5, s2 +; VI-NEXT: s_lshr_b64 s[2:3], s[4:5], 7 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -169,8 +162,10 @@ define amdgpu_kernel void @fshr_i32_imm(ptr addrspace(1) %in, i32 %x, i32 %y) { ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_alignbit_b32 v1, s2, v1, 7 +; GFX9-NEXT: s_mov_b32 s4, s3 +; GFX9-NEXT: s_mov_b32 s5, s2 +; GFX9-NEXT: s_lshr_b64 s[2:3], s[4:5], 7 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; @@ -191,25 +186,34 @@ define amdgpu_kernel void @fshr_i32_imm(ptr addrspace(1) %in, i32 %x, i32 %y) { ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; 
GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_alignbit_b32 v1, s2, s3, 7 +; GFX10-NEXT: s_mov_b32 s4, s3 +; GFX10-NEXT: s_mov_b32 s5, s2 +; GFX10-NEXT: s_lshr_b64 s[2:3], s[4:5], 7 +; GFX10-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: fshr_i32_imm: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_alignbit_b32 v1, s2, s3, 7 +; GFX11-NEXT: s_mov_b32 s4, s3 +; GFX11-NEXT: s_mov_b32 s5, s2 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_lshr_b64 s[2:3], s[4:5], 7 +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: fshr_i32_imm: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_alignbit_b32 v1, s2, s3, 7 +; GFX12-NEXT: s_mov_b32 s4, s3 +; GFX12-NEXT: s_mov_b32 s5, s2 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_lshr_b64 s[2:3], s[4:5], 7 +; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-NEXT: s_endpgm entry: @@ -218,22 +222,125 @@ entry: ret void } +define amdgpu_kernel void @fshr_i32_imm_src0(ptr addrspace(1) %in, i32 %x, i32 %y) { +; SI-LABEL: fshr_i32_imm_src0: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s9, 7 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s8, s3 +; SI-NEXT: s_and_b32 s0, s2, 31 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_lshr_b64 s[0:1], s[8:9], s0 +; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: fshr_i32_imm_src0: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_mov_b32 s5, 7 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s4, s3 +; VI-NEXT: s_and_b32 s2, s2, 31 +; VI-NEXT: s_lshr_b64 s[2:3], s[4:5], s2 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: fshr_i32_imm_src0: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_mov_b32 s5, 7 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s4, s3 +; GFX9-NEXT: s_and_b32 s2, s2, 31 +; GFX9-NEXT: s_lshr_b64 s[2:3], s[4:5], s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm +; +; R600-LABEL: fshr_i32_imm_src0: +; R600: ; %bb.0: ; %entry +; R600-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[] +; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1 +; R600-NEXT: CF_END +; R600-NEXT: PAD +; R600-NEXT: ALU clause starting at 4: +; R600-NEXT: LSHR * T0.X, KC0[2].Y, literal.x, +; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; R600-NEXT: BIT_ALIGN_INT * T1.X, literal.x, KC0[2].W, KC0[2].Z, +; R600-NEXT: 7(9.809089e-45), 0(0.000000e+00) +; +; GFX10-LABEL: fshr_i32_imm_src0: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: s_mov_b32 s5, 7 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX10-NEXT: s_mov_b32 s4, s3 +; GFX10-NEXT: s_and_b32 s2, s2, 31 +; GFX10-NEXT: s_lshr_b64 s[2:3], s[4:5], s2 +; GFX10-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: fshr_i32_imm_src0: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_mov_b32 s5, 7 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s4, s3 +; GFX11-NEXT: s_and_b32 s2, s2, 31 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_lshr_b64 s[2:3], s[4:5], s2 +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: fshr_i32_imm_src0: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_mov_b32 s5, 7 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_mov_b32 s4, s3 +; GFX12-NEXT: s_and_b32 s2, s2, 31 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_lshr_b64 s[2:3], s[4:5], s2 +; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-NEXT: s_endpgm +entry: + %0 = call i32 @llvm.fshr.i32(i32 7, i32 %y, i32 %x) + store i32 %0, ptr addrspace(1) %in + ret void +} + define amdgpu_kernel void @fshr_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i32> %y, <2 x i32> %z) { ; SI-LABEL: fshr_v2i32: ; SI: ; %bb.0: ; %entry ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb -; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xf -; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xf +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s3 -; SI-NEXT: v_mov_b32_e32 v1, s9 -; SI-NEXT: v_alignbit_b32 v1, s1, v0, v1 -; SI-NEXT: v_mov_b32_e32 v0, s2 -; SI-NEXT: v_mov_b32_e32 v2, s8 -; SI-NEXT: v_alignbit_b32 v0, s0, v0, v2 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: s_mov_b32 s6, s3 +; SI-NEXT: s_mov_b32 s7, s1 +; SI-NEXT: s_and_b32 s1, s5, 31 +; SI-NEXT: s_mov_b32 s3, s0 +; SI-NEXT: s_and_b32 s0, s4, 31 +; SI-NEXT: s_lshr_b64 s[6:7], s[6:7], s1 +; SI-NEXT: s_lshr_b64 s[0:1], s[2:3], s0 +; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: v_mov_b32_e32 v1, s6 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: fshr_v2i32: @@ -242,13 +349,16 @@ define amdgpu_kernel void @fshr_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i ; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s3 -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_alignbit_b32 v1, s1, v0, v1 -; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: v_alignbit_b32 v0, s0, v2, v0 +; VI-NEXT: s_mov_b32 s8, s3 +; VI-NEXT: s_mov_b32 s9, s1 +; VI-NEXT: s_and_b32 s1, s7, 31 +; VI-NEXT: s_mov_b32 s3, s0 +; VI-NEXT: s_and_b32 s0, s6, 31 +; VI-NEXT: s_lshr_b64 s[8:9], s[8:9], s1 +; VI-NEXT: s_lshr_b64 s[0:1], s[2:3], s0 ; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s8 ; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm @@ -260,12 +370,15 @@ define amdgpu_kernel void @fshr_v2i32(ptr addrspace(1) %in, <2 x i32> 
%x, <2 x i ; GFX9-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s3 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_alignbit_b32 v1, s1, v0, v1 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v3, s6 -; GFX9-NEXT: v_alignbit_b32 v0, s0, v0, v3 +; GFX9-NEXT: s_mov_b32 s4, s3 +; GFX9-NEXT: s_mov_b32 s5, s1 +; GFX9-NEXT: s_and_b32 s1, s7, 31 +; GFX9-NEXT: s_mov_b32 s3, s0 +; GFX9-NEXT: s_and_b32 s0, s6, 31 +; GFX9-NEXT: s_lshr_b64 s[4:5], s[4:5], s1 +; GFX9-NEXT: s_lshr_b64 s[0:1], s[2:3], s0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] ; GFX9-NEXT: s_endpgm ; @@ -286,79 +399,62 @@ define amdgpu_kernel void @fshr_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i ; GFX10-LABEL: fshr_v2i32: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x2 -; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c ; GFX10-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x24 -; GFX10-NEXT: v_mov_b32_e32 v3, 0 +; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s7 -; GFX10-NEXT: v_mov_b32_e32 v2, s6 -; GFX10-NEXT: v_alignbit_b32 v1, s1, s3, v0 -; GFX10-NEXT: v_alignbit_b32 v0, s0, s2, v2 -; GFX10-NEXT: global_store_dwordx2 v3, v[0:1], s[8:9] +; GFX10-NEXT: s_mov_b32 s4, s3 +; GFX10-NEXT: s_mov_b32 s5, s1 +; GFX10-NEXT: s_mov_b32 s3, s0 +; GFX10-NEXT: s_and_b32 s0, s6, 31 +; GFX10-NEXT: s_and_b32 s6, s7, 31 +; GFX10-NEXT: s_lshr_b64 s[0:1], s[2:3], s0 +; GFX10-NEXT: s_lshr_b64 s[2:3], s[4:5], s6 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] ; GFX10-NEXT: s_endpgm ; -; GFX11-TRUE16-LABEL: fshr_v2i32: -; GFX11-TRUE16: ; %bb.0: ; %entry -; GFX11-TRUE16-NEXT: s_clause 0x2 -; GFX11-TRUE16-NEXT: s_load_b64 s[6:7], s[4:5], 0x3c -; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c -; GFX11-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s7 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, s6 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_alignbit_b32 v1, s1, s3, v0.l -; GFX11-TRUE16-NEXT: v_alignbit_b32 v0, s0, s2, v0.h -; GFX11-TRUE16-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX11-TRUE16-NEXT: s_endpgm -; -; GFX11-FAKE16-LABEL: fshr_v2i32: -; GFX11-FAKE16: ; %bb.0: ; %entry -; GFX11-FAKE16-NEXT: s_clause 0x2 -; GFX11-FAKE16-NEXT: s_load_b64 s[6:7], s[4:5], 0x3c -; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c -; GFX11-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 -; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s7 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s6 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_alignbit_b32 v1, s1, s3, v0 -; GFX11-FAKE16-NEXT: v_alignbit_b32 v0, s0, s2, v2 -; GFX11-FAKE16-NEXT: global_store_b64 v3, v[0:1], s[4:5] -; GFX11-FAKE16-NEXT: s_endpgm -; -; GFX12-TRUE16-LABEL: fshr_v2i32: -; GFX12-TRUE16: ; %bb.0: ; %entry -; GFX12-TRUE16-NEXT: s_clause 0x2 -; GFX12-TRUE16-NEXT: s_load_b64 s[6:7], s[4:5], 0x3c -; GFX12-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c -; GFX12-TRUE16-NEXT: 
s_load_b64 s[4:5], s[4:5], 0x24 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, s7 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.h, s6 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_alignbit_b32 v1, s1, s3, v0.l -; GFX12-TRUE16-NEXT: v_alignbit_b32 v0, s0, s2, v0.h -; GFX12-TRUE16-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX12-TRUE16-NEXT: s_endpgm -; -; GFX12-FAKE16-LABEL: fshr_v2i32: -; GFX12-FAKE16: ; %bb.0: ; %entry -; GFX12-FAKE16-NEXT: s_clause 0x2 -; GFX12-FAKE16-NEXT: s_load_b64 s[6:7], s[4:5], 0x3c -; GFX12-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c -; GFX12-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 -; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-FAKE16-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s7 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, s6 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-FAKE16-NEXT: v_alignbit_b32 v1, s1, s3, v0 -; GFX12-FAKE16-NEXT: v_alignbit_b32 v0, s0, s2, v2 -; GFX12-FAKE16-NEXT: global_store_b64 v3, v[0:1], s[4:5] -; GFX12-FAKE16-NEXT: s_endpgm +; GFX11-LABEL: fshr_v2i32: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x3c +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s8, s3 +; GFX11-NEXT: s_mov_b32 s9, s1 +; GFX11-NEXT: s_mov_b32 s3, s0 +; GFX11-NEXT: s_and_b32 s0, s6, 31 +; GFX11-NEXT: s_and_b32 s6, s7, 31 +; GFX11-NEXT: s_lshr_b64 s[0:1], s[2:3], s0 +; GFX11-NEXT: s_lshr_b64 s[2:3], s[8:9], s6 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: fshr_v2i32: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x2 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c +; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x3c +; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_mov_b32 s8, s3 +; GFX12-NEXT: s_mov_b32 s9, s1 +; GFX12-NEXT: s_mov_b32 s3, s0 +; GFX12-NEXT: s_and_b32 s0, s6, 31 +; GFX12-NEXT: s_and_b32 s6, s7, 31 +; GFX12-NEXT: s_lshr_b64 s[0:1], s[2:3], s0 +; GFX12-NEXT: s_lshr_b64 s[2:3], s[8:9], s6 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 +; GFX12-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX12-NEXT: s_endpgm entry: %0 = call <2 x i32> @llvm.fshr.v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> %z) store <2 x i32> %0, ptr addrspace(1) %in @@ -373,10 +469,13 @@ define amdgpu_kernel void @fshr_v2i32_imm(ptr addrspace(1) %in, <2 x i32> %x, <2 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s3 -; SI-NEXT: v_mov_b32_e32 v2, s2 -; SI-NEXT: v_alignbit_b32 v1, s1, v0, 9 -; SI-NEXT: v_alignbit_b32 v0, s0, v2, 7 +; SI-NEXT: s_mov_b32 s8, s3 +; SI-NEXT: s_mov_b32 s9, s1 +; SI-NEXT: s_mov_b32 s3, s0 +; SI-NEXT: s_lshr_b64 s[8:9], s[8:9], 9 +; SI-NEXT: s_lshr_b64 s[0:1], s[2:3], 7 +; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: v_mov_b32_e32 v1, s8 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; @@ -385,11 +484,14 @@ define amdgpu_kernel void @fshr_v2i32_imm(ptr addrspace(1) %in, <2 x 
i32> %x, <2 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s3 -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_alignbit_b32 v1, s1, v0, 9 -; VI-NEXT: v_alignbit_b32 v0, s0, v2, 7 +; VI-NEXT: s_mov_b32 s6, s3 +; VI-NEXT: s_mov_b32 s7, s1 +; VI-NEXT: s_mov_b32 s3, s0 +; VI-NEXT: s_lshr_b64 s[0:1], s[6:7], 9 +; VI-NEXT: s_lshr_b64 s[2:3], s[2:3], 7 ; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s0 ; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm @@ -400,10 +502,13 @@ define amdgpu_kernel void @fshr_v2i32_imm(ptr addrspace(1) %in, <2 x i32> %x, <2 ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s3 -; GFX9-NEXT: v_mov_b32_e32 v3, s2 -; GFX9-NEXT: v_alignbit_b32 v1, s1, v0, 9 -; GFX9-NEXT: v_alignbit_b32 v0, s0, v3, 7 +; GFX9-NEXT: s_mov_b32 s4, s3 +; GFX9-NEXT: s_mov_b32 s5, s1 +; GFX9-NEXT: s_mov_b32 s3, s0 +; GFX9-NEXT: s_lshr_b64 s[0:1], s[4:5], 9 +; GFX9-NEXT: s_lshr_b64 s[2:3], s[2:3], 7 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] ; GFX9-NEXT: s_endpgm ; @@ -428,8 +533,13 @@ define amdgpu_kernel void @fshr_v2i32_imm(ptr addrspace(1) %in, <2 x i32> %x, <2 ; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_alignbit_b32 v1, s1, s3, 9 -; GFX10-NEXT: v_alignbit_b32 v0, s0, s2, 7 +; GFX10-NEXT: s_mov_b32 s4, s3 +; GFX10-NEXT: s_mov_b32 s3, s0 +; GFX10-NEXT: s_mov_b32 s5, s1 +; GFX10-NEXT: s_lshr_b64 s[0:1], s[2:3], 7 +; GFX10-NEXT: s_lshr_b64 s[2:3], s[4:5], 9 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] ; GFX10-NEXT: s_endpgm ; @@ -438,10 +548,15 @@ define amdgpu_kernel void @fshr_v2i32_imm(ptr addrspace(1) %in, <2 x i32> %x, <2 ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c ; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_alignbit_b32 v1, s1, s3, 9 -; GFX11-NEXT: v_alignbit_b32 v0, s0, s2, 7 +; GFX11-NEXT: s_mov_b32 s6, s3 +; GFX11-NEXT: s_mov_b32 s3, s0 +; GFX11-NEXT: s_mov_b32 s7, s1 +; GFX11-NEXT: s_lshr_b64 s[0:1], s[2:3], 7 +; GFX11-NEXT: s_lshr_b64 s[2:3], s[6:7], 9 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] ; GFX11-NEXT: s_endpgm ; @@ -450,10 +565,15 @@ define amdgpu_kernel void @fshr_v2i32_imm(ptr addrspace(1) %in, <2 x i32> %x, <2 ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c ; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 -; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_alignbit_b32 v1, s1, s3, 9 -; GFX12-NEXT: v_alignbit_b32 v0, s0, s2, 7 +; GFX12-NEXT: s_mov_b32 s6, s3 +; GFX12-NEXT: s_mov_b32 s3, s0 +; GFX12-NEXT: s_mov_b32 s7, s1 +; GFX12-NEXT: s_lshr_b64 s[0:1], s[2:3], 7 +; GFX12-NEXT: s_lshr_b64 s[2:3], s[6:7], 9 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 +; GFX12-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[4:5] ; GFX12-NEXT: 
s_endpgm entry: @@ -462,28 +582,173 @@ entry: ret void } -define amdgpu_kernel void @fshr_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i32> %y, <4 x i32> %z) { -; SI-LABEL: fshr_v4i32: +define amdgpu_kernel void @fshr_v2i32_imm_src1(ptr addrspace(1) %in, <2 x i32> %x, <2 x i32> %y) { +; SI-LABEL: fshr_v2i32_imm_src1: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0xd -; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x15 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb ; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s8, 9 +; SI-NEXT: s_mov_b32 s10, 7 ; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s9, s1 +; SI-NEXT: s_and_b32 s1, s3, 31 +; SI-NEXT: s_mov_b32 s11, s0 +; SI-NEXT: s_and_b32 s0, s2, 31 +; SI-NEXT: s_lshr_b64 s[8:9], s[8:9], s1 +; SI-NEXT: s_lshr_b64 s[0:1], s[10:11], s0 ; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: v_mov_b32_e32 v1, s8 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: fshr_v2i32_imm_src1: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 +; VI-NEXT: s_mov_b32 s6, 9 +; VI-NEXT: s_mov_b32 s8, 7 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s7, s1 +; VI-NEXT: s_and_b32 s1, s3, 31 +; VI-NEXT: s_mov_b32 s9, s0 +; VI-NEXT: s_and_b32 s0, s2, 31 +; VI-NEXT: s_lshr_b64 s[6:7], s[6:7], s1 +; VI-NEXT: s_lshr_b64 s[0:1], s[8:9], s0 +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s6 +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: fshr_v2i32_imm_src1: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GFX9-NEXT: s_mov_b32 s4, 9 +; GFX9-NEXT: s_mov_b32 s8, 7 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s5, s1 +; GFX9-NEXT: s_and_b32 s1, s3, 31 +; GFX9-NEXT: s_mov_b32 s9, s0 +; GFX9-NEXT: s_and_b32 s0, s2, 31 +; GFX9-NEXT: s_lshr_b64 s[4:5], s[4:5], s1 +; GFX9-NEXT: s_lshr_b64 s[0:1], s[8:9], s0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] +; GFX9-NEXT: s_endpgm +; +; R600-LABEL: fshr_v2i32_imm_src1: +; R600: ; %bb.0: ; %entry +; R600-NEXT: ALU 5, @4, KC0[CB0:0-32], KC1[] +; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 +; R600-NEXT: CF_END +; R600-NEXT: PAD +; R600-NEXT: ALU clause starting at 4: +; R600-NEXT: BIT_ALIGN_INT * T0.Y, KC0[3].X, literal.x, KC0[3].Z, +; R600-NEXT: 9(1.261169e-44), 0(0.000000e+00) +; R600-NEXT: BIT_ALIGN_INT * T0.X, KC0[2].W, literal.x, KC0[3].Y, +; R600-NEXT: 7(9.809089e-45), 0(0.000000e+00) +; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, +; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; +; GFX10-LABEL: fshr_v2i32_imm_src1: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GFX10-NEXT: s_mov_b32 s4, 9 +; GFX10-NEXT: s_mov_b32 s8, 7 +; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_mov_b32 s5, s1 +; GFX10-NEXT: s_mov_b32 s9, s0 +; GFX10-NEXT: s_and_b32 s0, s2, 31 +; GFX10-NEXT: s_and_b32 s2, s3, 31 +; GFX10-NEXT: s_lshr_b64 s[0:1], s[8:9], s0 +; GFX10-NEXT: s_lshr_b64 s[2:3], s[4:5], s2 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: v_mov_b32_e32 v1, s2 
+; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: fshr_v2i32_imm_src1:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c
+; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24
+; GFX11-NEXT: s_mov_b32 s6, 9
+; GFX11-NEXT: s_mov_b32 s8, 7
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_mov_b32 s7, s1
+; GFX11-NEXT: s_mov_b32 s9, s0
+; GFX11-NEXT: s_and_b32 s0, s2, 31
+; GFX11-NEXT: s_and_b32 s2, s3, 31
+; GFX11-NEXT: s_lshr_b64 s[0:1], s[8:9], s0
+; GFX11-NEXT: s_lshr_b64 s[2:3], s[6:7], s2
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: fshr_v2i32_imm_src1:
+; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c
+; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x24
+; GFX12-NEXT: s_mov_b32 s6, 9
+; GFX12-NEXT: s_mov_b32 s8, 7
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_mov_b32 s7, s1
+; GFX12-NEXT: s_mov_b32 s9, s0
+; GFX12-NEXT: s_and_b32 s0, s2, 31
+; GFX12-NEXT: s_and_b32 s2, s3, 31
+; GFX12-NEXT: s_lshr_b64 s[0:1], s[8:9], s0
+; GFX12-NEXT: s_lshr_b64 s[2:3], s[6:7], s2
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-NEXT: v_mov_b32_e32 v0, s0
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[4:5]
+; GFX12-NEXT: s_endpgm
+entry:
+ %0 = call <2 x i32> @llvm.fshr.v2i32(<2 x i32> %x, <2 x i32> <i32 7, i32 9>, <2 x i32> %y)
+ store <2 x i32> %0, ptr addrspace(1) %in
+ ret void
+}
+
+define amdgpu_kernel void @fshr_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
+; SI-LABEL: fshr_v4i32:
+; SI: ; %bb.0: ; %entry
+; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0xd
+; SI-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x15
+; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_mov_b32 s2, -1
 ; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v0, s15
-; SI-NEXT: v_mov_b32_e32 v1, s3
-; SI-NEXT: v_alignbit_b32 v3, s11, v0, v1
-; SI-NEXT: v_mov_b32_e32 v0, s14
-; SI-NEXT: v_mov_b32_e32 v1, s2
-; SI-NEXT: v_alignbit_b32 v2, s10, v0, v1
-; SI-NEXT: v_mov_b32_e32 v0, s13
-; SI-NEXT: v_mov_b32_e32 v1, s1
-; SI-NEXT: v_alignbit_b32 v1, s9, v0, v1
-; SI-NEXT: v_mov_b32_e32 v0, s12
-; SI-NEXT: v_mov_b32_e32 v4, s0
-; SI-NEXT: v_alignbit_b32 v0, s8, v0, v4
-; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; SI-NEXT: s_mov_b32 s4, s15
+; SI-NEXT: s_mov_b32 s5, s11
+; SI-NEXT: s_and_b32 s6, s19, 31
+; SI-NEXT: s_lshr_b64 s[4:5], s[4:5], s6
+; SI-NEXT: s_mov_b32 s15, s10
+; SI-NEXT: s_and_b32 s5, s18, 31
+; SI-NEXT: s_lshr_b64 s[6:7], s[14:15], s5
+; SI-NEXT: s_mov_b32 s10, s13
+; SI-NEXT: s_mov_b32 s11, s9
+; SI-NEXT: s_and_b32 s5, s17, 31
+; SI-NEXT: s_lshr_b64 s[10:11], s[10:11], s5
+; SI-NEXT: s_mov_b32 s13, s8
+; SI-NEXT: s_and_b32 s5, s16, 31
+; SI-NEXT: s_lshr_b64 s[8:9], s[12:13], s5
+; SI-NEXT: v_mov_b32_e32 v0, s8
+; SI-NEXT: v_mov_b32_e32 v1, s10
+; SI-NEXT: v_mov_b32_e32 v2, s6
+; SI-NEXT: v_mov_b32_e32 v3, s4
+; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; SI-NEXT: s_endpgm
 ;
 ; VI-LABEL: fshr_v4i32:
@@ -492,19 +757,25 @@ define amdgpu_kernel void @fshr_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i
 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
 ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24
 ; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: 
v_mov_b32_e32 v0, s15 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_mov_b32_e32 v2, s14 -; VI-NEXT: v_alignbit_b32 v3, s11, v0, v1 -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_alignbit_b32 v2, s10, v2, v0 -; VI-NEXT: v_mov_b32_e32 v0, s13 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_alignbit_b32 v1, s9, v0, v1 -; VI-NEXT: v_mov_b32_e32 v0, s12 -; VI-NEXT: v_mov_b32_e32 v4, s0 -; VI-NEXT: v_alignbit_b32 v0, s8, v0, v4 +; VI-NEXT: s_mov_b32 s6, s15 +; VI-NEXT: s_mov_b32 s7, s11 +; VI-NEXT: s_and_b32 s3, s3, 31 +; VI-NEXT: s_mov_b32 s15, s10 +; VI-NEXT: s_and_b32 s2, s2, 31 +; VI-NEXT: s_mov_b32 s10, s13 +; VI-NEXT: s_mov_b32 s11, s9 +; VI-NEXT: s_and_b32 s1, s1, 31 +; VI-NEXT: s_mov_b32 s13, s8 +; VI-NEXT: s_and_b32 s0, s0, 31 +; VI-NEXT: s_lshr_b64 s[6:7], s[6:7], s3 +; VI-NEXT: s_lshr_b64 s[2:3], s[14:15], s2 +; VI-NEXT: s_lshr_b64 s[10:11], s[10:11], s1 +; VI-NEXT: s_lshr_b64 s[0:1], s[12:13], s0 ; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s10 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v3, s6 ; VI-NEXT: v_mov_b32_e32 v5, s5 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_endpgm @@ -516,18 +787,24 @@ define amdgpu_kernel void @fshr_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s15 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_alignbit_b32 v3, s11, v0, v1 -; GFX9-NEXT: v_mov_b32_e32 v0, s14 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: v_alignbit_b32 v2, s10, v0, v1 -; GFX9-NEXT: v_mov_b32_e32 v0, s13 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_alignbit_b32 v1, s9, v0, v1 -; GFX9-NEXT: v_mov_b32_e32 v0, s12 -; GFX9-NEXT: v_mov_b32_e32 v5, s0 -; GFX9-NEXT: v_alignbit_b32 v0, s8, v0, v5 +; GFX9-NEXT: s_mov_b32 s4, s15 +; GFX9-NEXT: s_mov_b32 s5, s11 +; GFX9-NEXT: s_and_b32 s3, s3, 31 +; GFX9-NEXT: s_mov_b32 s15, s10 +; GFX9-NEXT: s_and_b32 s2, s2, 31 +; GFX9-NEXT: s_mov_b32 s10, s13 +; GFX9-NEXT: s_mov_b32 s11, s9 +; GFX9-NEXT: s_and_b32 s1, s1, 31 +; GFX9-NEXT: s_mov_b32 s13, s8 +; GFX9-NEXT: s_and_b32 s0, s0, 31 +; GFX9-NEXT: s_lshr_b64 s[4:5], s[4:5], s3 +; GFX9-NEXT: s_lshr_b64 s[2:3], s[14:15], s2 +; GFX9-NEXT: s_lshr_b64 s[10:11], s[10:11], s1 +; GFX9-NEXT: s_lshr_b64 s[0:1], s[12:13], s0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s10 +; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: v_mov_b32_e32 v3, s4 ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] ; GFX9-NEXT: s_endpgm ; @@ -552,101 +829,87 @@ define amdgpu_kernel void @fshr_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i ; GFX10-LABEL: fshr_v4i32: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x2 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 ; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 ; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; GFX10-NEXT: v_mov_b32_e32 v6, 0 +; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s3 +; GFX10-NEXT: s_mov_b32 s4, s15 +; GFX10-NEXT: s_mov_b32 s5, s11 +; GFX10-NEXT: s_and_b32 s11, s3, 31 +; GFX10-NEXT: s_mov_b32 s15, s10 +; GFX10-NEXT: s_and_b32 s10, s2, 31 +; GFX10-NEXT: s_mov_b32 s2, s13 +; GFX10-NEXT: s_mov_b32 s3, s9 +; GFX10-NEXT: s_and_b32 s16, s1, 31 +; GFX10-NEXT: s_mov_b32 s13, s8 +; GFX10-NEXT: s_and_b32 s8, s0, 31 +; GFX10-NEXT: s_lshr_b64 s[0:1], s[4:5], s11 +; GFX10-NEXT: s_lshr_b64 s[4:5], 
s[14:15], s10 +; GFX10-NEXT: s_lshr_b64 s[8:9], s[12:13], s8 +; GFX10-NEXT: s_lshr_b64 s[2:3], s[2:3], s16 +; GFX10-NEXT: v_mov_b32_e32 v0, s8 ; GFX10-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-NEXT: v_mov_b32_e32 v4, s1 -; GFX10-NEXT: v_mov_b32_e32 v5, s0 -; GFX10-NEXT: v_alignbit_b32 v3, s11, s15, v0 -; GFX10-NEXT: v_alignbit_b32 v2, s10, s14, v1 -; GFX10-NEXT: v_alignbit_b32 v1, s9, s13, v4 -; GFX10-NEXT: v_alignbit_b32 v0, s8, s12, v5 -; GFX10-NEXT: global_store_dwordx4 v6, v[0:3], s[6:7] +; GFX10-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-NEXT: v_mov_b32_e32 v3, s0 +; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] ; GFX10-NEXT: s_endpgm ; -; GFX11-TRUE16-LABEL: fshr_v4i32: -; GFX11-TRUE16: ; %bb.0: ; %entry -; GFX11-TRUE16-NEXT: s_clause 0x2 -; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x54 -; GFX11-TRUE16-NEXT: s_load_b256 s[8:15], s[4:5], 0x34 -; GFX11-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v5, 0 -; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, s2 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, s1 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, s0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_alignbit_b32 v3, s11, s15, v0.l -; GFX11-TRUE16-NEXT: v_alignbit_b32 v2, s10, s14, v0.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_alignbit_b32 v1, s9, s13, v1.l -; GFX11-TRUE16-NEXT: v_alignbit_b32 v0, s8, s12, v4.l -; GFX11-TRUE16-NEXT: global_store_b128 v5, v[0:3], s[4:5] -; GFX11-TRUE16-NEXT: s_endpgm -; -; GFX11-FAKE16-LABEL: fshr_v4i32: -; GFX11-FAKE16: ; %bb.0: ; %entry -; GFX11-FAKE16-NEXT: s_clause 0x2 -; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x54 -; GFX11-FAKE16-NEXT: s_load_b256 s[8:15], s[4:5], 0x34 -; GFX11-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, 0 -; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s3 :: v_dual_mov_b32 v1, s2 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s1 :: v_dual_mov_b32 v5, s0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_alignbit_b32 v3, s11, s15, v0 -; GFX11-FAKE16-NEXT: v_alignbit_b32 v2, s10, s14, v1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_alignbit_b32 v1, s9, s13, v4 -; GFX11-FAKE16-NEXT: v_alignbit_b32 v0, s8, s12, v5 -; GFX11-FAKE16-NEXT: global_store_b128 v6, v[0:3], s[4:5] -; GFX11-FAKE16-NEXT: s_endpgm -; -; GFX12-TRUE16-LABEL: fshr_v4i32: -; GFX12-TRUE16: ; %bb.0: ; %entry -; GFX12-TRUE16-NEXT: s_clause 0x2 -; GFX12-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x54 -; GFX12-TRUE16-NEXT: s_load_b256 s[8:15], s[4:5], 0x34 -; GFX12-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v5, 0 -; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, s3 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.h, s2 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.l, s1 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.l, s0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-TRUE16-NEXT: v_alignbit_b32 v3, s11, s15, v0.l -; GFX12-TRUE16-NEXT: v_alignbit_b32 v2, s10, s14, v0.h -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-TRUE16-NEXT: v_alignbit_b32 v1, s9, s13, v1.l -; 
GFX12-TRUE16-NEXT: v_alignbit_b32 v0, s8, s12, v4.l -; GFX12-TRUE16-NEXT: global_store_b128 v5, v[0:3], s[4:5] -; GFX12-TRUE16-NEXT: s_endpgm -; -; GFX12-FAKE16-LABEL: fshr_v4i32: -; GFX12-FAKE16: ; %bb.0: ; %entry -; GFX12-FAKE16-NEXT: s_clause 0x2 -; GFX12-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x54 -; GFX12-FAKE16-NEXT: s_load_b256 s[8:15], s[4:5], 0x34 -; GFX12-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, 0 -; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-FAKE16-NEXT: v_dual_mov_b32 v0, s3 :: v_dual_mov_b32 v1, s2 -; GFX12-FAKE16-NEXT: v_dual_mov_b32 v4, s1 :: v_dual_mov_b32 v5, s0 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX12-FAKE16-NEXT: v_alignbit_b32 v3, s11, s15, v0 -; GFX12-FAKE16-NEXT: v_alignbit_b32 v2, s10, s14, v1 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-FAKE16-NEXT: v_alignbit_b32 v1, s9, s13, v4 -; GFX12-FAKE16-NEXT: v_alignbit_b32 v0, s8, s12, v5 -; GFX12-FAKE16-NEXT: global_store_b128 v6, v[0:3], s[4:5] -; GFX12-FAKE16-NEXT: s_endpgm +; GFX11-LABEL: fshr_v4i32: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: s_load_b256 s[8:15], s[4:5], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x54 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s6, s15 +; GFX11-NEXT: s_mov_b32 s7, s11 +; GFX11-NEXT: s_and_b32 s11, s3, 31 +; GFX11-NEXT: s_mov_b32 s15, s10 +; GFX11-NEXT: s_and_b32 s10, s2, 31 +; GFX11-NEXT: s_mov_b32 s2, s13 +; GFX11-NEXT: s_mov_b32 s3, s9 +; GFX11-NEXT: s_and_b32 s16, s1, 31 +; GFX11-NEXT: s_mov_b32 s13, s8 +; GFX11-NEXT: s_and_b32 s8, s0, 31 +; GFX11-NEXT: s_lshr_b64 s[0:1], s[6:7], s11 +; GFX11-NEXT: s_lshr_b64 s[6:7], s[14:15], s10 +; GFX11-NEXT: s_lshr_b64 s[8:9], s[12:13], s8 +; GFX11-NEXT: s_lshr_b64 s[2:3], s[2:3], s16 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v3, s0 +; GFX11-NEXT: v_mov_b32_e32 v2, s6 +; GFX11-NEXT: global_store_b128 v4, v[0:3], s[4:5] +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: fshr_v4i32: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x2 +; GFX12-NEXT: s_load_b256 s[8:15], s[4:5], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x54 +; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_mov_b32 s6, s15 +; GFX12-NEXT: s_mov_b32 s7, s11 +; GFX12-NEXT: s_and_b32 s11, s3, 31 +; GFX12-NEXT: s_mov_b32 s15, s10 +; GFX12-NEXT: s_and_b32 s10, s2, 31 +; GFX12-NEXT: s_mov_b32 s2, s13 +; GFX12-NEXT: s_mov_b32 s3, s9 +; GFX12-NEXT: s_and_b32 s16, s1, 31 +; GFX12-NEXT: s_mov_b32 s13, s8 +; GFX12-NEXT: s_and_b32 s8, s0, 31 +; GFX12-NEXT: s_lshr_b64 s[0:1], s[6:7], s11 +; GFX12-NEXT: s_lshr_b64 s[6:7], s[14:15], s10 +; GFX12-NEXT: s_lshr_b64 s[8:9], s[12:13], s8 +; GFX12-NEXT: s_lshr_b64 s[2:3], s[2:3], s16 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s2 +; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v3, s0 +; GFX12-NEXT: v_mov_b32_e32 v2, s6 +; GFX12-NEXT: global_store_b128 v4, v[0:3], s[4:5] +; GFX12-NEXT: s_endpgm entry: %0 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) store <4 x i32> %0, ptr addrspace(1) %in @@ -661,14 +924,20 @@ define amdgpu_kernel void @fshr_v4i32_imm(ptr addrspace(1) %in, <4 x i32> %x, <4 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; 
SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s15 -; SI-NEXT: v_mov_b32_e32 v1, s14 -; SI-NEXT: v_alignbit_b32 v3, s11, v0, 1 -; SI-NEXT: v_mov_b32_e32 v0, s13 -; SI-NEXT: v_alignbit_b32 v2, s10, v1, 9 -; SI-NEXT: v_alignbit_b32 v1, s9, v0, 7 -; SI-NEXT: v_mov_b32_e32 v0, s12 -; SI-NEXT: v_alignbit_b32 v0, s8, v0, 1 +; SI-NEXT: s_mov_b32 s4, s15 +; SI-NEXT: s_mov_b32 s5, s11 +; SI-NEXT: s_mov_b32 s15, s10 +; SI-NEXT: s_mov_b32 s10, s13 +; SI-NEXT: s_mov_b32 s11, s9 +; SI-NEXT: s_mov_b32 s13, s8 +; SI-NEXT: s_lshr_b64 s[4:5], s[4:5], 1 +; SI-NEXT: s_lshr_b64 s[6:7], s[14:15], 9 +; SI-NEXT: s_lshr_b64 s[10:11], s[10:11], 7 +; SI-NEXT: s_lshr_b64 s[8:9], s[12:13], 1 +; SI-NEXT: v_mov_b32_e32 v0, s8 +; SI-NEXT: v_mov_b32_e32 v1, s10 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s4 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; @@ -677,15 +946,21 @@ define amdgpu_kernel void @fshr_v4i32_imm(ptr addrspace(1) %in, <4 x i32> %x, <4 ; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s15 -; VI-NEXT: v_mov_b32_e32 v1, s14 -; VI-NEXT: v_mov_b32_e32 v4, s13 -; VI-NEXT: v_alignbit_b32 v3, s11, v0, 1 -; VI-NEXT: v_alignbit_b32 v2, s10, v1, 9 -; VI-NEXT: v_alignbit_b32 v1, s9, v4, 7 -; VI-NEXT: v_mov_b32_e32 v0, s12 +; VI-NEXT: s_mov_b32 s2, s15 +; VI-NEXT: s_mov_b32 s3, s11 +; VI-NEXT: s_mov_b32 s15, s10 +; VI-NEXT: s_mov_b32 s6, s13 +; VI-NEXT: s_mov_b32 s7, s9 +; VI-NEXT: s_mov_b32 s13, s8 +; VI-NEXT: s_lshr_b64 s[2:3], s[2:3], 1 +; VI-NEXT: s_lshr_b64 s[4:5], s[14:15], 9 +; VI-NEXT: s_lshr_b64 s[6:7], s[6:7], 7 +; VI-NEXT: s_lshr_b64 s[8:9], s[12:13], 1 ; VI-NEXT: v_mov_b32_e32 v5, s1 -; VI-NEXT: v_alignbit_b32 v0, s8, v0, 1 +; VI-NEXT: v_mov_b32_e32 v0, s8 +; VI-NEXT: v_mov_b32_e32 v1, s6 +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_mov_b32_e32 v3, s2 ; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_endpgm @@ -696,14 +971,20 @@ define amdgpu_kernel void @fshr_v4i32_imm(ptr addrspace(1) %in, <4 x i32> %x, <4 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s15 -; GFX9-NEXT: v_mov_b32_e32 v1, s14 -; GFX9-NEXT: v_alignbit_b32 v3, s11, v0, 1 -; GFX9-NEXT: v_mov_b32_e32 v0, s13 -; GFX9-NEXT: v_alignbit_b32 v2, s10, v1, 9 -; GFX9-NEXT: v_alignbit_b32 v1, s9, v0, 7 -; GFX9-NEXT: v_mov_b32_e32 v0, s12 -; GFX9-NEXT: v_alignbit_b32 v0, s8, v0, 1 +; GFX9-NEXT: s_mov_b32 s2, s15 +; GFX9-NEXT: s_mov_b32 s3, s11 +; GFX9-NEXT: s_mov_b32 s15, s10 +; GFX9-NEXT: s_mov_b32 s6, s13 +; GFX9-NEXT: s_mov_b32 s7, s9 +; GFX9-NEXT: s_mov_b32 s13, s8 +; GFX9-NEXT: s_lshr_b64 s[2:3], s[2:3], 1 +; GFX9-NEXT: s_lshr_b64 s[4:5], s[14:15], 9 +; GFX9-NEXT: s_lshr_b64 s[6:7], s[6:7], 7 +; GFX9-NEXT: s_lshr_b64 s[8:9], s[12:13], 1 +; GFX9-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mov_b32_e32 v3, s2 ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX9-NEXT: s_endpgm ; @@ -730,10 +1011,20 @@ define amdgpu_kernel void @fshr_v4i32_imm(ptr addrspace(1) %in, <4 x i32> %x, <4 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_alignbit_b32 v3, s11, s15, 1 -; GFX10-NEXT: v_alignbit_b32 v2, s10, s14, 9 -; GFX10-NEXT: v_alignbit_b32 v1, s9, s13, 7 -; 
GFX10-NEXT: v_alignbit_b32 v0, s8, s12, 1 +; GFX10-NEXT: s_mov_b32 s2, s15 +; GFX10-NEXT: s_mov_b32 s3, s11 +; GFX10-NEXT: s_mov_b32 s15, s10 +; GFX10-NEXT: s_mov_b32 s4, s13 +; GFX10-NEXT: s_mov_b32 s5, s9 +; GFX10-NEXT: s_mov_b32 s13, s8 +; GFX10-NEXT: s_lshr_b64 s[2:3], s[2:3], 1 +; GFX10-NEXT: s_lshr_b64 s[6:7], s[14:15], 9 +; GFX10-NEXT: s_lshr_b64 s[8:9], s[12:13], 1 +; GFX10-NEXT: s_lshr_b64 s[4:5], s[4:5], 7 +; GFX10-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-NEXT: v_mov_b32_e32 v2, s6 +; GFX10-NEXT: v_mov_b32_e32 v3, s2 ; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX10-NEXT: s_endpgm ; @@ -742,12 +1033,21 @@ define amdgpu_kernel void @fshr_v4i32_imm(ptr addrspace(1) %in, <4 x i32> %x, <4 ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: s_load_b256 s[8:15], s[4:5], 0x34 ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_alignbit_b32 v3, s11, s15, 1 -; GFX11-NEXT: v_alignbit_b32 v2, s10, s14, 9 -; GFX11-NEXT: v_alignbit_b32 v1, s9, s13, 7 -; GFX11-NEXT: v_alignbit_b32 v0, s8, s12, 1 +; GFX11-NEXT: s_mov_b32 s2, s15 +; GFX11-NEXT: s_mov_b32 s3, s11 +; GFX11-NEXT: s_mov_b32 s15, s10 +; GFX11-NEXT: s_mov_b32 s4, s13 +; GFX11-NEXT: s_mov_b32 s5, s9 +; GFX11-NEXT: s_mov_b32 s13, s8 +; GFX11-NEXT: s_lshr_b64 s[2:3], s[2:3], 1 +; GFX11-NEXT: s_lshr_b64 s[6:7], s[14:15], 9 +; GFX11-NEXT: s_lshr_b64 s[8:9], s[12:13], 1 +; GFX11-NEXT: s_lshr_b64 s[4:5], s[4:5], 7 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s4 +; GFX11-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v3, s2 +; GFX11-NEXT: v_mov_b32_e32 v2, s6 ; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1] ; GFX11-NEXT: s_endpgm ; @@ -756,12 +1056,21 @@ define amdgpu_kernel void @fshr_v4i32_imm(ptr addrspace(1) %in, <4 x i32> %x, <4 ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b256 s[8:15], s[4:5], 0x34 ; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX12-NEXT: v_mov_b32_e32 v4, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_alignbit_b32 v3, s11, s15, 1 -; GFX12-NEXT: v_alignbit_b32 v2, s10, s14, 9 -; GFX12-NEXT: v_alignbit_b32 v1, s9, s13, 7 -; GFX12-NEXT: v_alignbit_b32 v0, s8, s12, 1 +; GFX12-NEXT: s_mov_b32 s2, s15 +; GFX12-NEXT: s_mov_b32 s3, s11 +; GFX12-NEXT: s_mov_b32 s15, s10 +; GFX12-NEXT: s_mov_b32 s4, s13 +; GFX12-NEXT: s_mov_b32 s5, s9 +; GFX12-NEXT: s_mov_b32 s13, s8 +; GFX12-NEXT: s_lshr_b64 s[2:3], s[2:3], 1 +; GFX12-NEXT: s_lshr_b64 s[6:7], s[14:15], 9 +; GFX12-NEXT: s_lshr_b64 s[8:9], s[12:13], 1 +; GFX12-NEXT: s_lshr_b64 s[4:5], s[4:5], 7 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s4 +; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v3, s2 +; GFX12-NEXT: v_mov_b32_e32 v2, s6 ; GFX12-NEXT: global_store_b128 v4, v[0:3], s[0:1] ; GFX12-NEXT: s_endpgm entry: @@ -770,6 +1079,194 @@ entry: ret void } +define amdgpu_kernel void @fshr_v4i32_imm_src0(ptr addrspace(1) %in, <4 x i32> %x, <4 x i32> %y) { +; SI-LABEL: fshr_v4i32_imm_src0: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0xd +; SI-NEXT: s_mov_b32 s7, 33 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s6, s11 +; SI-NEXT: s_and_b32 s4, s15, 31 +; SI-NEXT: s_lshr_b64 s[4:5], s[6:7], s4 +; SI-NEXT: s_mov_b32 s11, 9 +; SI-NEXT: s_and_b32 s5, s14, 31 +; SI-NEXT: s_lshr_b64 s[6:7], s[10:11], s5 
+; SI-NEXT: s_mov_b32 s11, 7 +; SI-NEXT: s_mov_b32 s10, s9 +; SI-NEXT: s_and_b32 s5, s13, 31 +; SI-NEXT: s_lshr_b64 s[10:11], s[10:11], s5 +; SI-NEXT: s_mov_b32 s9, 1 +; SI-NEXT: s_and_b32 s5, s12, 31 +; SI-NEXT: s_lshr_b64 s[8:9], s[8:9], s5 +; SI-NEXT: v_mov_b32_e32 v0, s8 +; SI-NEXT: v_mov_b32_e32 v1, s10 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s4 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: fshr_v4i32_imm_src0: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 +; VI-NEXT: s_mov_b32 s1, 33 +; VI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 +; VI-NEXT: s_mov_b32 s7, 7 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s0, s11 +; VI-NEXT: s_and_b32 s4, s15, 31 +; VI-NEXT: s_lshr_b64 s[0:1], s[0:1], s4 +; VI-NEXT: s_mov_b32 s11, 9 +; VI-NEXT: s_and_b32 s1, s14, 31 +; VI-NEXT: s_lshr_b64 s[4:5], s[10:11], s1 +; VI-NEXT: s_mov_b32 s6, s9 +; VI-NEXT: s_and_b32 s1, s13, 31 +; VI-NEXT: s_lshr_b64 s[6:7], s[6:7], s1 +; VI-NEXT: s_mov_b32 s9, 1 +; VI-NEXT: s_and_b32 s1, s12, 31 +; VI-NEXT: s_lshr_b64 s[8:9], s[8:9], s1 +; VI-NEXT: v_mov_b32_e32 v5, s3 +; VI-NEXT: v_mov_b32_e32 v0, s8 +; VI-NEXT: v_mov_b32_e32 v1, s6 +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_mov_b32_e32 v3, s0 +; VI-NEXT: v_mov_b32_e32 v4, s2 +; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: fshr_v4i32_imm_src0: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 +; GFX9-NEXT: s_mov_b32 s1, 33 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 +; GFX9-NEXT: s_mov_b32 s7, 7 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s0, s11 +; GFX9-NEXT: s_and_b32 s4, s15, 31 +; GFX9-NEXT: s_lshr_b64 s[0:1], s[0:1], s4 +; GFX9-NEXT: s_mov_b32 s11, 9 +; GFX9-NEXT: s_and_b32 s1, s14, 31 +; GFX9-NEXT: s_lshr_b64 s[4:5], s[10:11], s1 +; GFX9-NEXT: s_mov_b32 s6, s9 +; GFX9-NEXT: s_and_b32 s1, s13, 31 +; GFX9-NEXT: s_lshr_b64 s[6:7], s[6:7], s1 +; GFX9-NEXT: s_mov_b32 s9, 1 +; GFX9-NEXT: s_and_b32 s1, s12, 31 +; GFX9-NEXT: s_lshr_b64 s[8:9], s[8:9], s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] +; GFX9-NEXT: s_endpgm +; +; R600-LABEL: fshr_v4i32_imm_src0: +; R600: ; %bb.0: ; %entry +; R600-NEXT: ALU 8, @4, KC0[CB0:0-32], KC1[] +; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1 +; R600-NEXT: CF_END +; R600-NEXT: PAD +; R600-NEXT: ALU clause starting at 4: +; R600-NEXT: BIT_ALIGN_INT * T0.W, literal.x, KC0[4].X, KC0[5].X, +; R600-NEXT: 33(4.624285e-44), 0(0.000000e+00) +; R600-NEXT: BIT_ALIGN_INT * T0.Z, literal.x, KC0[3].W, KC0[4].W, +; R600-NEXT: 9(1.261169e-44), 0(0.000000e+00) +; R600-NEXT: BIT_ALIGN_INT * T0.Y, literal.x, KC0[3].Z, KC0[4].Z, +; R600-NEXT: 7(9.809089e-45), 0(0.000000e+00) +; R600-NEXT: BIT_ALIGN_INT * T0.X, 1, KC0[3].Y, KC0[4].Y, +; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, +; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; +; GFX10-LABEL: fshr_v4i32_imm_src0: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GFX10-NEXT: s_mov_b32 s1, 33 +; GFX10-NEXT: s_mov_b32 s3, 7 +; GFX10-NEXT: v_mov_b32_e32 v4, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_mov_b32 s0, s11 +; GFX10-NEXT: s_and_b32 s4, s15, 31 +; GFX10-NEXT: s_mov_b32 s11, 9 +; GFX10-NEXT: 
s_and_b32 s5, s14, 31
+; GFX10-NEXT: s_mov_b32 s2, s9
+; GFX10-NEXT: s_and_b32 s13, s13, 31
+; GFX10-NEXT: s_mov_b32 s9, 1
+; GFX10-NEXT: s_and_b32 s12, s12, 31
+; GFX10-NEXT: s_lshr_b64 s[0:1], s[0:1], s4
+; GFX10-NEXT: s_lshr_b64 s[4:5], s[10:11], s5
+; GFX10-NEXT: s_lshr_b64 s[8:9], s[8:9], s12
+; GFX10-NEXT: s_lshr_b64 s[2:3], s[2:3], s13
+; GFX10-NEXT: v_mov_b32_e32 v0, s8
+; GFX10-NEXT: v_mov_b32_e32 v1, s2
+; GFX10-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-NEXT: v_mov_b32_e32 v3, s0
+; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: fshr_v4i32_imm_src0:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b256 s[8:15], s[4:5], 0x34
+; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24
+; GFX11-NEXT: s_mov_b32 s1, 33
+; GFX11-NEXT: s_mov_b32 s3, 7
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_mov_b32 s0, s11
+; GFX11-NEXT: s_and_b32 s6, s15, 31
+; GFX11-NEXT: s_mov_b32 s11, 9
+; GFX11-NEXT: s_and_b32 s7, s14, 31
+; GFX11-NEXT: s_mov_b32 s2, s9
+; GFX11-NEXT: s_and_b32 s13, s13, 31
+; GFX11-NEXT: s_mov_b32 s9, 1
+; GFX11-NEXT: s_and_b32 s12, s12, 31
+; GFX11-NEXT: s_lshr_b64 s[0:1], s[0:1], s6
+; GFX11-NEXT: s_lshr_b64 s[6:7], s[10:11], s7
+; GFX11-NEXT: s_lshr_b64 s[8:9], s[8:9], s12
+; GFX11-NEXT: s_lshr_b64 s[2:3], s[2:3], s13
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v3, s0
+; GFX11-NEXT: v_mov_b32_e32 v2, s6
+; GFX11-NEXT: global_store_b128 v4, v[0:3], s[4:5]
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: fshr_v4i32_imm_src0:
+; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: s_load_b256 s[8:15], s[4:5], 0x34
+; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x24
+; GFX12-NEXT: s_mov_b32 s1, 33
+; GFX12-NEXT: s_mov_b32 s3, 7
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_mov_b32 s0, s11
+; GFX12-NEXT: s_and_b32 s6, s15, 31
+; GFX12-NEXT: s_mov_b32 s11, 9
+; GFX12-NEXT: s_and_b32 s7, s14, 31
+; GFX12-NEXT: s_mov_b32 s2, s9
+; GFX12-NEXT: s_and_b32 s13, s13, 31
+; GFX12-NEXT: s_mov_b32 s9, 1
+; GFX12-NEXT: s_and_b32 s12, s12, 31
+; GFX12-NEXT: s_lshr_b64 s[0:1], s[0:1], s6
+; GFX12-NEXT: s_lshr_b64 s[6:7], s[10:11], s7
+; GFX12-NEXT: s_lshr_b64 s[8:9], s[8:9], s12
+; GFX12-NEXT: s_lshr_b64 s[2:3], s[2:3], s13
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v3, s0
+; GFX12-NEXT: v_mov_b32_e32 v2, s6
+; GFX12-NEXT: global_store_b128 v4, v[0:3], s[4:5]
+; GFX12-NEXT: s_endpgm
+entry:
+ %0 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> <i32 1, i32 7, i32 9, i32 33>, <4 x i32> %x, <4 x i32> %y)
+ store <4 x i32> %0, ptr addrspace(1) %in
+ ret void
+}
+
 define i32 @v_fshr_i32(i32 %src0, i32 %src1, i32 %src2) {
 ; GFX89-LABEL: v_fshr_i32:
 ; GFX89: ; %bb.0:
@@ -2091,29 +2588,109 @@ define <2 x i24> @v_fshr_v2i24(<2 x i24> %src0, <2 x i24> %src1, <2 x i24> %src2
 ; GFX10-NEXT: v_alignbit_b32 v1, v1, v3, v5
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: v_fshr_v2i24:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_and_b32_e32 v4, 0xffffff, v4
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffffff, v5
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 8, v2
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 8, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_mul_hi_u32 v6, 0xaaaaaab, v4
-; GFX11-NEXT: v_mul_hi_u32 v7, 0xaaaaaab, v5
-; GFX11-NEXT: 
s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_mul_u32_u24_e32 v6, 24, v6 -; GFX11-NEXT: v_mul_u32_u24_e32 v7, 24, v7 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_sub_nc_u32_e32 v4, v4, v6 -; GFX11-NEXT: v_sub_nc_u32_e32 v5, v5, v7 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_nc_u32_e32 v4, 8, v4 -; GFX11-NEXT: v_add_nc_u32_e32 v5, 8, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_alignbit_b32 v0, v0, v2, v4 -; GFX11-NEXT: v_alignbit_b32 v1, v1, v3, v5 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: v_fshr_v2i24: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffffff, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffffff, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mul_hi_u32 v6, 0xaaaaaab, v4 +; GFX11-TRUE16-NEXT: v_mul_hi_u32 v7, 0xaaaaaab, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_mul_u32_u24_e32 v6, 24, v6 +; GFX11-TRUE16-NEXT: v_mul_u32_u24_e32 v7, 24, v7 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_sub_nc_u32_e32 v4, v4, v6 +; GFX11-TRUE16-NEXT: v_sub_nc_u32_e32 v5, v5, v7 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 8, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 8, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_alignbit_b32 v0, v0, v2, v4.l +; GFX11-TRUE16-NEXT: v_alignbit_b32 v1, v1, v3, v5.l +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: v_fshr_v2i24: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffffff, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffffff, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_mul_hi_u32 v6, 0xaaaaaab, v4 +; GFX11-FAKE16-NEXT: v_mul_hi_u32 v7, 0xaaaaaab, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_mul_u32_u24_e32 v6, 24, v6 +; GFX11-FAKE16-NEXT: v_mul_u32_u24_e32 v7, 24, v7 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_sub_nc_u32_e32 v4, v4, v6 +; GFX11-FAKE16-NEXT: v_sub_nc_u32_e32 v5, v5, v7 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 8, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 8, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_alignbit_b32 v0, v0, v2, v4 +; GFX11-FAKE16-NEXT: v_alignbit_b32 v1, v1, v3, v5 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-TRUE16-LABEL: v_fshr_v2i24: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: 
s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffffff, v4
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffffff, v5
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-TRUE16-NEXT: v_mul_hi_u32 v6, 0xaaaaaab, v4
+; GFX12-TRUE16-NEXT: v_mul_hi_u32 v7, 0xaaaaaab, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_mul_u32_u24_e32 v6, 24, v6
+; GFX12-TRUE16-NEXT: v_mul_u32_u24_e32 v7, 24, v7
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_sub_nc_u32_e32 v4, v4, v6
+; GFX12-TRUE16-NEXT: v_sub_nc_u32_e32 v5, v5, v7
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_add_nc_u32_e32 v4, 8, v4
+; GFX12-TRUE16-NEXT: v_add_nc_u32_e32 v5, 8, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_alignbit_b32 v0, v0, v2, v4.l
+; GFX12-TRUE16-NEXT: v_alignbit_b32 v1, v1, v3, v5.l
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: v_fshr_v2i24:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 0xffffff, v4
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffffff, v5
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 8, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-FAKE16-NEXT: v_mul_hi_u32 v6, 0xaaaaaab, v4
+; GFX12-FAKE16-NEXT: v_mul_hi_u32 v7, 0xaaaaaab, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_mul_u32_u24_e32 v6, 24, v6
+; GFX12-FAKE16-NEXT: v_mul_u32_u24_e32 v7, 24, v7
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_sub_nc_u32_e32 v4, v4, v6
+; GFX12-FAKE16-NEXT: v_sub_nc_u32_e32 v5, v5, v7
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_add_nc_u32_e32 v4, 8, v4
+; GFX12-FAKE16-NEXT: v_add_nc_u32_e32 v5, 8, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_alignbit_b32 v0, v0, v2, v4
+; GFX12-FAKE16-NEXT: v_alignbit_b32 v1, v1, v3, v5
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
 %ret = call <2 x i24> @llvm.fshr.v2i24(<2 x i24> %src0, <2 x i24> %src1, <2 x i24> %src2)
 ret <2 x i24> %ret
 }
diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
index 76016e46426bd..92ea83fdfb982 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
@@ -238,11 +238,11 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi(ptr addrspace(1) %out,
 ; VI-NEXT: s_waitcnt lgkmcnt(0)
 ; VI-NEXT: s_load_dword s2, s[2:3], 0x0
 ; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v2, s4
 ; VI-NEXT: v_mov_b32_e32 v1, s1
 ; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshr_b32 s0, s2, 16
-; VI-NEXT: v_alignbit_b32 v2, s0, v2, 16
+; VI-NEXT: s_lshr_b32 s5, s2, 16
+; VI-NEXT: s_lshr_b64 s[0:1], s[4:5], 16
+; VI-NEXT: v_mov_b32_e32 v2, s0
 ; VI-NEXT: flat_store_dword v[0:1], v2
 ; VI-NEXT: s_endpgm
 ;
@@ -256,11 +256,11 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi(ptr addrspace(1) %out,
 ; CI-NEXT: s_waitcnt lgkmcnt(0)
 ; CI-NEXT: s_load_dword s2, s[2:3], 0x0
 ; CI-NEXT: v_mov_b32_e32 v0, s0
-; CI-NEXT: v_mov_b32_e32 v2, s4
 ; CI-NEXT: v_mov_b32_e32 v1, s1
 ; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: s_lshr_b32 s0, s2, 16
-; CI-NEXT: v_alignbit_b32 v2, s0, v2, 16
+; CI-NEXT: s_lshr_b32 s5, s2, 16
+; CI-NEXT: s_lshr_b64 s[0:1], s[4:5], 16
+; CI-NEXT: v_mov_b32_e32 v2, s0
 ; CI-NEXT: flat_store_dword v[0:1], v2
 ; CI-NEXT: s_endpgm
 ;
@@ -312,16 +312,16 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_multi_use_1(ptr addrspa
 ; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
 ; VI-NEXT: s_waitcnt lgkmcnt(0)
 ; VI-NEXT: s_load_dword s2, s[2:3], 0x0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s4
 ; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: s_lshr_b32 s0, s4, 16
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: s_lshr_b32 s3, s4, 16
 ; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshr_b32 s1, s2, 16
-; VI-NEXT: v_alignbit_b32 v2, s1, v2, 16
+; VI-NEXT: s_lshr_b32 s5, s2, 16
+; VI-NEXT: s_lshr_b64 s[0:1], s[4:5], 16
+; VI-NEXT: v_mov_b32_e32 v2, s0
 ; VI-NEXT: flat_store_dword v[0:1], v2
 ; VI-NEXT: ;;#ASMSTART
-; VI-NEXT: ; use s0
+; VI-NEXT: ; use s3
 ; VI-NEXT: ;;#ASMEND
 ; VI-NEXT: s_endpgm
 ;
@@ -334,16 +334,16 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_multi_use_1(ptr addrspa
 ; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
 ; CI-NEXT: s_waitcnt lgkmcnt(0)
 ; CI-NEXT: s_load_dword s2, s[2:3], 0x0
-; CI-NEXT: v_mov_b32_e32 v1, s1
-; CI-NEXT: v_mov_b32_e32 v2, s4
 ; CI-NEXT: v_mov_b32_e32 v0, s0
-; CI-NEXT: s_lshr_b32 s0, s4, 16
+; CI-NEXT: v_mov_b32_e32 v1, s1
+; CI-NEXT: s_lshr_b32 s3, s4, 16
 ; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: s_lshr_b32 s1, s2, 16
-; CI-NEXT: v_alignbit_b32 v2, s1, v2, 16
+; CI-NEXT: s_lshr_b32 s5, s2, 16
+; CI-NEXT: s_lshr_b64 s[0:1], s[4:5], 16
+; CI-NEXT: v_mov_b32_e32 v2, s0
 ; CI-NEXT: flat_store_dword v[0:1], v2
 ; CI-NEXT: ;;#ASMSTART
-; CI-NEXT: ; use s0
+; CI-NEXT: ; use s3
 ; CI-NEXT: ;;#ASMEND
 ; CI-NEXT: s_endpgm
 ;
@@ -405,19 +405,19 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_both_multi_use_1(ptr ad
 ; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
 ; VI-NEXT: s_waitcnt lgkmcnt(0)
 ; VI-NEXT: s_load_dword s2, s[2:3], 0x0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s4
 ; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: s_lshr_b32 s0, s4, 16
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: s_lshr_b32 s3, s4, 16
 ; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshr_b32 s1, s2, 16
-; VI-NEXT: v_alignbit_b32 v2, s1, v2, 16
+; VI-NEXT: s_lshr_b32 s5, s2, 16
+; VI-NEXT: s_lshr_b64 s[0:1], s[4:5], 16
+; VI-NEXT: v_mov_b32_e32 v2, s0
 ; VI-NEXT: flat_store_dword v[0:1], v2
 ; VI-NEXT: ;;#ASMSTART
-; VI-NEXT: ; use s0
+; VI-NEXT: ; use s3
 ; VI-NEXT: ;;#ASMEND
 ; VI-NEXT: ;;#ASMSTART
-; VI-NEXT: ; use s1
+; VI-NEXT: ; use s5
 ; VI-NEXT: ;;#ASMEND
 ; VI-NEXT: s_endpgm
 ;
@@ -430,19 +430,19 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_both_multi_use_1(ptr ad
 ; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
 ; CI-NEXT: s_waitcnt lgkmcnt(0)
 ; CI-NEXT: s_load_dword s2, s[2:3], 0x0
-; CI-NEXT: v_mov_b32_e32 v1, s1
-; CI-NEXT: v_mov_b32_e32 v2, s4
 ; CI-NEXT: v_mov_b32_e32 v0, s0
-; CI-NEXT: s_lshr_b32 s0, s4, 16
+; CI-NEXT: v_mov_b32_e32 v1, s1
+; CI-NEXT: s_lshr_b32 s3, s4, 16
 ; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: s_lshr_b32 s1, s2, 16
-; CI-NEXT: v_alignbit_b32 v2, s1, v2, 16
+; CI-NEXT: s_lshr_b32 s5, s2, 16
+; CI-NEXT: s_lshr_b64 s[0:1], s[4:5], 16
+; CI-NEXT: v_mov_b32_e32 v2, s0
 ; CI-NEXT: flat_store_dword v[0:1], v2
 ; CI-NEXT: ;;#ASMSTART
-; CI-NEXT: ; use s0
+; CI-NEXT: ; use s3
 ; CI-NEXT: ;;#ASMEND
 ; CI-NEXT: ;;#ASMSTART
-; CI-NEXT: ; use s1
+; CI-NEXT: ; use s5
 ; CI-NEXT: ;;#ASMEND
 ; CI-NEXT: s_endpgm
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll
index 6f63384be90fd..2d60c5729ed52 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll
@@ -9775,17 +9775,17 @@ define amdgpu_kernel void @constant_zextload_v4i8_to_v4i16(ptr addrspace(1) %out
 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX6-NOHSA-NEXT: s_load_dword s4, s[2:3], 0x0
 ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000
-; GFX6-NOHSA-NEXT: s_mov_b32 s2, -1
 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NOHSA-NEXT: s_and_b32 s5, s4, 0xff00
-; GFX6-NOHSA-NEXT: s_lshr_b32 s6, s4, 24
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s4
-; GFX6-NOHSA-NEXT: s_and_b32 s4, s4, 0xff
-; GFX6-NOHSA-NEXT: v_alignbit_b32 v0, s6, v0, 16
-; GFX6-NOHSA-NEXT: s_lshl_b32 s5, s5, 8
-; GFX6-NOHSA-NEXT: s_or_b32 s4, s4, s5
-; GFX6-NOHSA-NEXT: v_and_b32_e32 v1, 0xff00ff, v0
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NOHSA-NEXT: s_lshr_b32 s5, s4, 24
+; GFX6-NOHSA-NEXT: s_and_b32 s2, s4, 0xff00
+; GFX6-NOHSA-NEXT: s_and_b32 s6, s4, 0xff
+; GFX6-NOHSA-NEXT: s_lshl_b32 s2, s2, 8
+; GFX6-NOHSA-NEXT: s_lshr_b64 s[4:5], s[4:5], 16
+; GFX6-NOHSA-NEXT: s_or_b32 s5, s6, s2
+; GFX6-NOHSA-NEXT: s_and_b32 s4, s4, 0xff00ff
+; GFX6-NOHSA-NEXT: s_mov_b32 s2, -1
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s4
 ; GFX6-NOHSA-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GFX6-NOHSA-NEXT: s_endpgm
 ;
@@ -9800,15 +9800,15 @@ define amdgpu_kernel void @constant_zextload_v4i8_to_v4i16(ptr addrspace(1) %out
 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0
 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s1
 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-HSA-NEXT: s_lshr_b32 s3, s2, 24
 ; GFX7-HSA-NEXT: s_and_b32 s0, s2, 0xff00
-; GFX7-HSA-NEXT: s_lshr_b32 s1, s2, 24
-; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s2
-; GFX7-HSA-NEXT: s_and_b32 s2, s2, 0xff
-; GFX7-HSA-NEXT: s_lshl_b32 s0, s0, 8
-; GFX7-HSA-NEXT: v_alignbit_b32 v2, s1, v2, 16
-; GFX7-HSA-NEXT: s_or_b32 s0, s2, s0
-; GFX7-HSA-NEXT: v_and_b32_e32 v3, 0xff00ff, v2
-; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s0
+; GFX7-HSA-NEXT: s_and_b32 s4, s2, 0xff
+; GFX7-HSA-NEXT: s_lshl_b32 s5, s0, 8
+; GFX7-HSA-NEXT: s_lshr_b64 s[0:1], s[2:3], 16
+; GFX7-HSA-NEXT: s_or_b32 s1, s4, s5
+; GFX7-HSA-NEXT: s_and_b32 s0, s0, 0xff00ff
+; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s1
+; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s0
 ; GFX7-HSA-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
 ; GFX7-HSA-NEXT: s_endpgm
 ;
@@ -9820,15 +9820,15 @@ define amdgpu_kernel void @constant_zextload_v4i8_to_v4i16(ptr addrspace(1) %out
 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0
 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s1
 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: s_lshr_b32 s0, s2, 24
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s2
-; GFX8-NOHSA-NEXT: s_and_b32 s1, s2, 0xff
-; GFX8-NOHSA-NEXT: s_lshl_b32 s2, s2, 8
-; GFX8-NOHSA-NEXT: v_alignbit_b32 v2, s0, v2, 16
-; GFX8-NOHSA-NEXT: s_and_b32 s0, s2, 0xff0000
-; GFX8-NOHSA-NEXT: s_or_b32 s0, s1, s0
-; GFX8-NOHSA-NEXT: v_and_b32_e32 v3, 0xff00ff, v2
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s0
+; GFX8-NOHSA-NEXT: s_lshr_b32 s3, s2, 24
+; GFX8-NOHSA-NEXT: s_lshl_b32 s0, s2, 8
+; GFX8-NOHSA-NEXT: s_and_b32 s4, s2, 0xff
+; GFX8-NOHSA-NEXT: s_and_b32 s5, s0, 0xff0000
+; GFX8-NOHSA-NEXT: s_lshr_b64 s[0:1], s[2:3], 16
+; GFX8-NOHSA-NEXT: s_or_b32 s1, s4, s5
+; GFX8-NOHSA-NEXT: s_and_b32 s0, s0, 0xff00ff
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s1
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s0
 ; GFX8-NOHSA-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
 ; GFX8-NOHSA-NEXT: s_endpgm
 ;
@@ -10062,26 +10062,28 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i16(ptr addrspace(1) %out
 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX6-NOHSA-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
 ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000
-; GFX6-NOHSA-NEXT: s_mov_b32 s2, -1
 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NOHSA-NEXT: s_and_b32 s6, s4, 0xff00
-; GFX6-NOHSA-NEXT: s_lshr_b32 s7, s4, 24
-; GFX6-NOHSA-NEXT: s_and_b32 s8, s5, 0xff00
-; GFX6-NOHSA-NEXT: s_lshr_b32 s9, s5, 24
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s5
-; GFX6-NOHSA-NEXT: s_and_b32 s5, s5, 0xff
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s4
-; GFX6-NOHSA-NEXT: s_and_b32 s4, s4, 0xff
-; GFX6-NOHSA-NEXT: v_alignbit_b32 v0, s9, v0, 16
+; GFX6-NOHSA-NEXT: s_lshr_b32 s2, s4, 24
+; GFX6-NOHSA-NEXT: s_lshr_b32 s7, s5, 24
+; GFX6-NOHSA-NEXT: s_and_b32 s8, s4, 0xff00
+; GFX6-NOHSA-NEXT: s_and_b32 s9, s5, 0xff00
+; GFX6-NOHSA-NEXT: s_and_b32 s10, s5, 0xff
+; GFX6-NOHSA-NEXT: s_and_b32 s11, s4, 0xff
+; GFX6-NOHSA-NEXT: s_mov_b32 s6, s5
+; GFX6-NOHSA-NEXT: s_lshl_b32 s9, s9, 8
 ; GFX6-NOHSA-NEXT: s_lshl_b32 s8, s8, 8
-; GFX6-NOHSA-NEXT: v_alignbit_b32 v1, s7, v1, 16
-; GFX6-NOHSA-NEXT: s_lshl_b32 s6, s6, 8
-; GFX6-NOHSA-NEXT: v_and_b32_e32 v3, 0xff00ff, v0
-; GFX6-NOHSA-NEXT: s_or_b32 s5, s5, s8
-; GFX6-NOHSA-NEXT: s_or_b32 s4, s4, s6
-; GFX6-NOHSA-NEXT: v_and_b32_e32 v1, 0xff00ff, v1
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NOHSA-NEXT: s_lshr_b64 s[6:7], s[6:7], 16
+; GFX6-NOHSA-NEXT: s_mov_b32 s5, s2
+; GFX6-NOHSA-NEXT: s_lshr_b64 s[4:5], s[4:5], 16
+; GFX6-NOHSA-NEXT: s_or_b32 s5, s10, s9
+; GFX6-NOHSA-NEXT: s_or_b32 s7, s11, s8
+; GFX6-NOHSA-NEXT: s_and_b32 s6, s6, 0xff00ff
+; GFX6-NOHSA-NEXT: s_and_b32 s4, s4, 0xff00ff
+; GFX6-NOHSA-NEXT: s_mov_b32 s2, -1
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s7
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s4
 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s5
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s6
 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; GFX6-NOHSA-NEXT: s_endpgm
 ;
@@ -10096,24 +10098,26 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i16(ptr addrspace(1) %out
 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0
 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1
 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-HSA-NEXT: s_lshr_b32 s5, s3, 24
-; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s3
-; GFX7-HSA-NEXT: v_alignbit_b32 v0, s5, v0, 16
 ; GFX7-HSA-NEXT: s_and_b32 s0, s2, 0xff00
-; GFX7-HSA-NEXT: s_lshr_b32 s1, s2, 24
-; GFX7-HSA-NEXT: s_and_b32 s4, s3, 0xff00
-; GFX7-HSA-NEXT: v_and_b32_e32 v3, 0xff00ff, v0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2
-; GFX7-HSA-NEXT: s_and_b32 s3, s3, 0xff
-; GFX7-HSA-NEXT: s_lshl_b32 s4, s4, 8
-; GFX7-HSA-NEXT: v_alignbit_b32 v0, s1, v0, 16
-; GFX7-HSA-NEXT: s_and_b32 s1, s2, 0xff
+; GFX7-HSA-NEXT: s_and_b32 s5, s3, 0xff00
+; GFX7-HSA-NEXT: s_and_b32 s6, s3, 0xff
+; GFX7-HSA-NEXT: s_and_b32 s7, s2, 0xff
+; GFX7-HSA-NEXT: s_lshl_b32 s5, s5, 8
 ; GFX7-HSA-NEXT: s_lshl_b32 s0, s0, 8
-; GFX7-HSA-NEXT: s_or_b32 s3, s3, s4
-; GFX7-HSA-NEXT: s_or_b32 s0, s1, s0
-; GFX7-HSA-NEXT: v_and_b32_e32 v1, 0xff00ff, v0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s3
+; GFX7-HSA-NEXT: s_lshr_b32 s4, s2, 24
+; GFX7-HSA-NEXT: s_lshr_b32 s1, s3, 24
+; GFX7-HSA-NEXT: s_or_b32 s5, s6, s5
+; GFX7-HSA-NEXT: s_or_b32 s6, s7, s0
+; GFX7-HSA-NEXT: s_mov_b32 s0, s3
+; GFX7-HSA-NEXT: s_lshr_b64 s[0:1], s[0:1], 16
+; GFX7-HSA-NEXT: s_mov_b32 s3, s4
+; GFX7-HSA-NEXT: s_and_b32 s7, s0, 0xff00ff
+; GFX7-HSA-NEXT: s_lshr_b64 s[0:1], s[2:3], 16
+; GFX7-HSA-NEXT: s_and_b32 s0, s0, 0xff00ff
+; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s6
+; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s0
+; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s5
+; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s7
 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
 ; GFX7-HSA-NEXT: s_endpgm
 ;
@@ -10122,28 +10126,29 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i16(ptr addrspace(1) %out
 ; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX8-NOHSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0
 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0
 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: s_lshr_b32 s0, s2, 24
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2
 ; GFX8-NOHSA-NEXT: s_lshr_b32 s1, s3, 24
 ; GFX8-NOHSA-NEXT: s_bfe_u32 s4, s3, 0x80010
 ; GFX8-NOHSA-NEXT: s_and_b32 s5, s3, 0xff
 ; GFX8-NOHSA-NEXT: s_lshl_b32 s3, s3, 8
-; GFX8-NOHSA-NEXT: v_alignbit_b32 v0, s0, v0, 16
-; GFX8-NOHSA-NEXT: s_and_b32 s0, s2, 0xff
-; GFX8-NOHSA-NEXT: s_lshl_b32 s2, s2, 8
+; GFX8-NOHSA-NEXT: s_and_b32 s3, s3, 0xff0000
 ; GFX8-NOHSA-NEXT: s_lshl_b32 s1, s1, 16
+; GFX8-NOHSA-NEXT: s_or_b32 s5, s5, s3
+; GFX8-NOHSA-NEXT: s_lshl_b32 s3, s2, 8
+; GFX8-NOHSA-NEXT: s_lshr_b32 s0, s2, 24
+; GFX8-NOHSA-NEXT: s_or_b32 s4, s4, s1
+; GFX8-NOHSA-NEXT: s_and_b32 s1, s2, 0xff
 ; GFX8-NOHSA-NEXT: s_and_b32 s3, s3, 0xff0000
-; GFX8-NOHSA-NEXT: s_and_b32 s2, s2, 0xff0000
-; GFX8-NOHSA-NEXT: s_or_b32 s1, s4, s1
-; GFX8-NOHSA-NEXT: s_or_b32 s3, s5, s3
-; GFX8-NOHSA-NEXT: s_or_b32 s0, s0, s2
-; GFX8-NOHSA-NEXT: v_and_b32_e32 v1, 0xff00ff, v0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s3
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s1
+; GFX8-NOHSA-NEXT: s_or_b32 s6, s1, s3
+; GFX8-NOHSA-NEXT: s_mov_b32 s3, s0
+; GFX8-NOHSA-NEXT: s_lshr_b64 s[0:1], s[2:3], 16
+; GFX8-NOHSA-NEXT: s_and_b32 s0, s0, 0xff00ff
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s5
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s4
 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
 ; GFX8-NOHSA-NEXT: s_endpgm
 ;
@@ -10500,43 +10505,48 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i16(ptr addrspace(1) %o
 ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000
 ; GFX6-NOHSA-NEXT: s_mov_b32 s2, -1
 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NOHSA-NEXT: s_and_b32 s8, s6, 0xff00
-; GFX6-NOHSA-NEXT: s_lshr_b32 s9, s6, 24
-; GFX6-NOHSA-NEXT: s_and_b32 s10, s7, 0xff00
-; GFX6-NOHSA-NEXT: s_lshr_b32 s11, s7, 24
-; GFX6-NOHSA-NEXT: s_and_b32 s12, s4, 0xff00
-; GFX6-NOHSA-NEXT: s_lshr_b32 s13, s4, 24
-; GFX6-NOHSA-NEXT: s_and_b32 s14, s5, 0xff00
-; GFX6-NOHSA-NEXT: s_lshr_b32 s15, s5, 24
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s5
-; GFX6-NOHSA-NEXT: s_and_b32 s5, s5, 0xff
+; GFX6-NOHSA-NEXT: s_lshr_b32 s12, s6, 24
+; GFX6-NOHSA-NEXT: s_lshr_b32 s9, s7, 24
+; GFX6-NOHSA-NEXT: s_and_b32 s13, s6, 0xff00
+; GFX6-NOHSA-NEXT: s_and_b32 s14, s7, 0xff00
+; GFX6-NOHSA-NEXT: s_lshr_b32 s15, s4, 24
+; GFX6-NOHSA-NEXT: s_lshr_b32 s11, s5, 24
+; GFX6-NOHSA-NEXT: s_and_b32 s16, s4, 0xff00
+; GFX6-NOHSA-NEXT: s_and_b32 s17, s5, 0xff00
+; GFX6-NOHSA-NEXT: s_and_b32 s18, s5, 0xff
+; GFX6-NOHSA-NEXT: s_and_b32 s19, s4, 0xff
+; GFX6-NOHSA-NEXT: s_mov_b32 s10, s5
+; GFX6-NOHSA-NEXT: s_and_b32 s20, s7, 0xff
+; GFX6-NOHSA-NEXT: s_and_b32 s21, s6, 0xff
+; GFX6-NOHSA-NEXT: s_mov_b32 s8, s7
+; GFX6-NOHSA-NEXT: s_lshl_b32 s17, s17, 8
+; GFX6-NOHSA-NEXT: s_lshl_b32 s16, s16, 8
+; GFX6-NOHSA-NEXT: s_lshr_b64 s[10:11], s[10:11], 16
+; GFX6-NOHSA-NEXT: s_mov_b32 s5, s15
+; GFX6-NOHSA-NEXT: s_lshr_b64 s[4:5], s[4:5], 16
+; GFX6-NOHSA-NEXT: s_lshl_b32 s5, s14, 8
+; GFX6-NOHSA-NEXT: s_lshl_b32 s11, s13, 8
+; GFX6-NOHSA-NEXT: s_lshr_b64 s[8:9], s[8:9], 16
+; GFX6-NOHSA-NEXT: s_mov_b32 s7, s12
+; GFX6-NOHSA-NEXT: s_lshr_b64 s[6:7], s[6:7], 16
+; GFX6-NOHSA-NEXT: s_or_b32 s7, s18, s17
+; GFX6-NOHSA-NEXT: s_or_b32 s9, s19, s16
+; GFX6-NOHSA-NEXT: s_and_b32 s10, s10, 0xff00ff
+; GFX6-NOHSA-NEXT: s_or_b32 s5, s20, s5
+; GFX6-NOHSA-NEXT: s_or_b32 s11, s21, s11
+; GFX6-NOHSA-NEXT: s_and_b32 s8, s8, 0xff00ff
+; GFX6-NOHSA-NEXT: s_and_b32 s6, s6, 0xff00ff
+; GFX6-NOHSA-NEXT: s_and_b32 s4, s4, 0xff00ff
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s11
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s6
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s5
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s8
+; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
+; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0)
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s9
 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s4
-; GFX6-NOHSA-NEXT: s_and_b32 s4, s4, 0xff
 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s7
-; GFX6-NOHSA-NEXT: s_and_b32 s7, s7, 0xff
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s6
-; GFX6-NOHSA-NEXT: s_and_b32 s6, s6, 0xff
-; GFX6-NOHSA-NEXT: v_alignbit_b32 v0, s15, v0, 16
-; GFX6-NOHSA-NEXT: s_lshl_b32 s14, s14, 8
-; GFX6-NOHSA-NEXT: v_alignbit_b32 v1, s13, v1, 16
-; GFX6-NOHSA-NEXT: s_lshl_b32 s12, s12, 8
-; GFX6-NOHSA-NEXT: v_alignbit_b32 v2, s11, v2, 16
-; GFX6-NOHSA-NEXT: s_lshl_b32 s10, s10, 8
-; GFX6-NOHSA-NEXT: v_alignbit_b32 v4, s9, v3, 16
-; GFX6-NOHSA-NEXT: s_lshl_b32 s8, s8, 8
-; GFX6-NOHSA-NEXT: v_and_b32_e32 v3, 0xff00ff, v0
-; GFX6-NOHSA-NEXT: s_or_b32 s5, s5, s14
-; GFX6-NOHSA-NEXT: v_and_b32_e32 v1, 0xff00ff, v1
-; GFX6-NOHSA-NEXT: s_or_b32 s4, s4, s12
-; GFX6-NOHSA-NEXT: v_and_b32_e32 v7, 0xff00ff, v2
-; GFX6-NOHSA-NEXT: s_or_b32 s7, s7, s10
-; GFX6-NOHSA-NEXT: s_or_b32 s6, s6, s8
-; GFX6-NOHSA-NEXT: v_and_b32_e32 v5, 0xff00ff, v4
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s6
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s7
-; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s4
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s5
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s10
 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; GFX6-NOHSA-NEXT: s_endpgm
 ;
@@ -10549,48 +10559,52 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i16(ptr addrspace(1) %o
 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-HSA-NEXT: s_lshr_b32 s13, s5, 24
-; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s5
-; GFX7-HSA-NEXT: v_alignbit_b32 v0, s13, v0, 16
-; GFX7-HSA-NEXT: s_lshr_b32 s11, s4, 24
-; GFX7-HSA-NEXT: v_and_b32_e32 v3, 0xff00ff, v0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s4
-; GFX7-HSA-NEXT: v_alignbit_b32 v0, s11, v0, 16
-; GFX7-HSA-NEXT: s_lshr_b32 s9, s7, 24
-; GFX7-HSA-NEXT: v_and_b32_e32 v1, 0xff00ff, v0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s7
-; GFX7-HSA-NEXT: v_alignbit_b32 v0, s9, v0, 16
-; GFX7-HSA-NEXT: s_and_b32 s2, s6, 0xff00
-; GFX7-HSA-NEXT: s_lshr_b32 s3, s6, 24
-; GFX7-HSA-NEXT: s_and_b32 s8, s7, 0xff00
-; GFX7-HSA-NEXT: s_and_b32 s10, s4, 0xff00
-; GFX7-HSA-NEXT: s_and_b32 s12, s5, 0xff00
-; GFX7-HSA-NEXT: v_and_b32_e32 v7, 0xff00ff, v0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s6
-; GFX7-HSA-NEXT: s_and_b32 s5, s5, 0xff
-; GFX7-HSA-NEXT: s_lshl_b32 s12, s12, 8
-; GFX7-HSA-NEXT: s_and_b32 s4, s4, 0xff
-; GFX7-HSA-NEXT: s_lshl_b32 s10, s10, 8
-; GFX7-HSA-NEXT: s_and_b32 s7, s7, 0xff
+; GFX7-HSA-NEXT: s_and_b32 s13, s5, 0xff00
+; GFX7-HSA-NEXT: s_and_b32 s8, s4, 0xff00
+; GFX7-HSA-NEXT: s_and_b32 s14, s5, 0xff
+; GFX7-HSA-NEXT: s_lshl_b32 s13, s13, 8
+; GFX7-HSA-NEXT: s_lshr_b32 s12, s4, 24
+; GFX7-HSA-NEXT: s_or_b32 s13, s14, s13
+; GFX7-HSA-NEXT: s_and_b32 s14, s4, 0xff
 ; GFX7-HSA-NEXT: s_lshl_b32 s8, s8, 8
-; GFX7-HSA-NEXT: v_alignbit_b32 v0, s3, v0, 16
-; GFX7-HSA-NEXT: s_and_b32 s3, s6, 0xff
+; GFX7-HSA-NEXT: s_lshr_b32 s9, s5, 24
+; GFX7-HSA-NEXT: s_or_b32 s14, s14, s8
+; GFX7-HSA-NEXT: s_mov_b32 s8, s5
+; GFX7-HSA-NEXT: s_mov_b32 s5, s12
+; GFX7-HSA-NEXT: s_and_b32 s11, s7, 0xff00
+; GFX7-HSA-NEXT: s_lshr_b64 s[8:9], s[8:9], 16
+; GFX7-HSA-NEXT: s_lshr_b64 s[4:5], s[4:5], 16
+; GFX7-HSA-NEXT: s_and_b32 s2, s6, 0xff00
+; GFX7-HSA-NEXT: s_and_b32 s5, s7, 0xff
+; GFX7-HSA-NEXT: s_lshl_b32 s9, s11, 8
+; GFX7-HSA-NEXT: s_or_b32 s5, s5, s9
+; GFX7-HSA-NEXT: s_and_b32 s9, s6, 0xff
 ; GFX7-HSA-NEXT: s_lshl_b32 s2, s2, 8
-; GFX7-HSA-NEXT: s_or_b32 s5, s5, s12
-; GFX7-HSA-NEXT: s_or_b32 s4, s4, s10
-; GFX7-HSA-NEXT: s_or_b32 s7, s7, s8
-; GFX7-HSA-NEXT: s_or_b32 s2, s3, s2
-; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2
+; GFX7-HSA-NEXT: s_lshr_b32 s10, s6, 24
+; GFX7-HSA-NEXT: s_lshr_b32 s3, s7, 24
+; GFX7-HSA-NEXT: s_or_b32 s9, s9, s2
+; GFX7-HSA-NEXT: s_mov_b32 s2, s7
+; GFX7-HSA-NEXT: s_lshr_b64 s[2:3], s[2:3], 16
+; GFX7-HSA-NEXT: s_mov_b32 s7, s10
+; GFX7-HSA-NEXT: s_and_b32 s11, s2, 0xff00ff
+; GFX7-HSA-NEXT: s_lshr_b64 s[2:3], s[6:7], 16
+; GFX7-HSA-NEXT: s_and_b32 s8, s8, 0xff00ff
+; GFX7-HSA-NEXT: s_and_b32 s4, s4, 0xff00ff
+; GFX7-HSA-NEXT: s_and_b32 s2, s2, 0xff00ff
+; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s2
 ; GFX7-HSA-NEXT: s_add_u32 s2, s0, 16
 ; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s3
-; GFX7-HSA-NEXT: v_and_b32_e32 v5, 0xff00ff, v0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s7
-; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s2
-; GFX7-HSA-NEXT: flat_store_dwordx4 v[8:9], v[4:7]
-; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s4
-; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1
+; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3
+; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s9
 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s5
+; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s11
+; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2
+; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1
+; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s14
+; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s13
+; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s8
 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0
 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
 ; GFX7-HSA-NEXT: s_endpgm
@@ -10601,50 +10615,52 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i16(ptr addrspace(1) %o
 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: s_lshr_b32 s3, s4, 24
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4
-; GFX8-NOHSA-NEXT: v_alignbit_b32 v0, s3, v0, 16
-; GFX8-NOHSA-NEXT: s_and_b32 s3, s4, 0xff
-; GFX8-NOHSA-NEXT: s_lshl_b32 s4, s4, 8
-; GFX8-NOHSA-NEXT: s_lshr_b32 s8, s5, 24
-; GFX8-NOHSA-NEXT: s_and_b32 s4, s4, 0xff0000
+; GFX8-NOHSA-NEXT: s_lshr_b32 s3, s5, 24
 ; GFX8-NOHSA-NEXT: s_bfe_u32 s9, s5, 0x80010
-; GFX8-NOHSA-NEXT: s_lshl_b32 s8, s8, 16
-; GFX8-NOHSA-NEXT: s_or_b32 s4, s3, s4
-; GFX8-NOHSA-NEXT: s_lshr_b32 s3, s7, 24
-; GFX8-NOHSA-NEXT: s_lshr_b32 s2, s6, 24
-; GFX8-NOHSA-NEXT: s_or_b32 s8, s9, s8
-; GFX8-NOHSA-NEXT: v_and_b32_e32 v1, 0xff00ff, v0
-; GFX8-NOHSA-NEXT: s_lshl_b32 s3, s3, 16
-; GFX8-NOHSA-NEXT: s_bfe_u32 s9, s7, 0x80010
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s6
 ; GFX8-NOHSA-NEXT: s_and_b32 s10, s5, 0xff
 ; GFX8-NOHSA-NEXT: s_lshl_b32 s5, s5, 8
-; GFX8-NOHSA-NEXT: s_or_b32 s3, s9, s3
-; GFX8-NOHSA-NEXT: s_and_b32 s9, s7, 0xff
-; GFX8-NOHSA-NEXT: s_lshl_b32 s7, s7, 8
-; GFX8-NOHSA-NEXT: v_alignbit_b32 v0, s2, v0, 16
-; GFX8-NOHSA-NEXT: s_and_b32 s2, s6, 0xff
-; GFX8-NOHSA-NEXT: s_lshl_b32 s6, s6, 8
+; GFX8-NOHSA-NEXT: s_lshl_b32 s3, s3, 16
+; GFX8-NOHSA-NEXT: s_lshr_b32 s2, s4, 24
 ; GFX8-NOHSA-NEXT: s_and_b32 s5, s5, 0xff0000
-; GFX8-NOHSA-NEXT: s_and_b32 s7, s7, 0xff0000
-; GFX8-NOHSA-NEXT: s_and_b32 s6, s6, 0xff0000
-; GFX8-NOHSA-NEXT: s_or_b32 s5, s10, s5
-; GFX8-NOHSA-NEXT: s_or_b32 s7, s9, s7
-; GFX8-NOHSA-NEXT: s_or_b32 s2, s2, s6
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s2
+; GFX8-NOHSA-NEXT: s_or_b32 s9, s9, s3
+; GFX8-NOHSA-NEXT: s_lshl_b32 s3, s4, 8
+; GFX8-NOHSA-NEXT: s_and_b32 s11, s4, 0xff
+; GFX8-NOHSA-NEXT: s_or_b32 s10, s10, s5
+; GFX8-NOHSA-NEXT: s_and_b32 s3, s3, 0xff0000
+; GFX8-NOHSA-NEXT: s_mov_b32 s5, s2
+; GFX8-NOHSA-NEXT: s_or_b32 s11, s11, s3
+; GFX8-NOHSA-NEXT: s_lshr_b64 s[2:3], s[4:5], 16
+; GFX8-NOHSA-NEXT: s_and_b32 s4, s2, 0xff00ff
+; GFX8-NOHSA-NEXT: s_lshr_b32 s2, s7, 24
+; GFX8-NOHSA-NEXT: s_lshl_b32 s2, s2, 16
+; GFX8-NOHSA-NEXT: s_bfe_u32 s3, s7, 0x80010
+; GFX8-NOHSA-NEXT: s_or_b32 s5, s3, s2
+; GFX8-NOHSA-NEXT: s_lshl_b32 s3, s7, 8
+; GFX8-NOHSA-NEXT: s_and_b32 s2, s7, 0xff
+; GFX8-NOHSA-NEXT: s_and_b32 s3, s3, 0xff0000
+; GFX8-NOHSA-NEXT: s_lshr_b32 s8, s6, 24
+; GFX8-NOHSA-NEXT: s_or_b32 s12, s2, s3
+; GFX8-NOHSA-NEXT: s_lshl_b32 s3, s6, 8
+; GFX8-NOHSA-NEXT: s_and_b32 s2, s6, 0xff
+; GFX8-NOHSA-NEXT: s_and_b32 s3, s3, 0xff0000
+; GFX8-NOHSA-NEXT: s_mov_b32 s7, s8
+; GFX8-NOHSA-NEXT: s_or_b32 s13, s2, s3
+; GFX8-NOHSA-NEXT: s_lshr_b64 s[2:3], s[6:7], 16
+; GFX8-NOHSA-NEXT: s_and_b32 s2, s2, 0xff00ff
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s2
 ; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 16
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3
 ; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s3
-; GFX8-NOHSA-NEXT: v_and_b32_e32 v3, 0xff00ff, v0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s7
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s2
-; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[6:7], v[2:5]
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s13
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s12
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s5
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2
+; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s5
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s8
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s11
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s4
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s10
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s9
 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0
 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
 ; GFX8-NOHSA-NEXT: s_endpgm
@@ -11272,81 +11288,92 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i16(ptr addrspace(1) %o
 ; GFX6-NOHSA-NEXT: s_mov_b32 s11, 0xf000
 ; GFX6-NOHSA-NEXT: s_mov_b32 s10, -1
 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NOHSA-NEXT: s_and_b32 s12, s6, 0xff00
-; GFX6-NOHSA-NEXT: s_lshr_b32 s13, s6, 24
-; GFX6-NOHSA-NEXT: s_and_b32 s14, s7, 0xff00
-; GFX6-NOHSA-NEXT: s_lshr_b32 s15, s7, 24
-; GFX6-NOHSA-NEXT: s_and_b32 s16, s4, 0xff00
-; GFX6-NOHSA-NEXT: s_lshr_b32 s17, s4, 24
-; GFX6-NOHSA-NEXT: s_and_b32 s18, s5, 0xff00
-; GFX6-NOHSA-NEXT: s_lshr_b32 s19, s5, 24
-; GFX6-NOHSA-NEXT: s_and_b32 s20, s2, 0xff00
-; GFX6-NOHSA-NEXT: s_lshr_b32 s21, s2, 24
-; GFX6-NOHSA-NEXT: s_and_b32 s22, s3, 0xff00
-; GFX6-NOHSA-NEXT: s_lshr_b32 s23, s3, 24
-; GFX6-NOHSA-NEXT: s_and_b32 s24, s0, 0xff00
-; GFX6-NOHSA-NEXT: s_lshr_b32 s25, s0, 24
-; GFX6-NOHSA-NEXT: s_and_b32 s26, s1, 0xff00
-; GFX6-NOHSA-NEXT: s_lshr_b32 s27, s1, 24
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s1
-; GFX6-NOHSA-NEXT: s_and_b32 s1, s1, 0xff
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s0
-; GFX6-NOHSA-NEXT: s_and_b32 s0, s0, 0xff
+; GFX6-NOHSA-NEXT: s_lshr_b32 s20, s6, 24
+; GFX6-NOHSA-NEXT: s_lshr_b32 s13, s7, 24
+; GFX6-NOHSA-NEXT: s_and_b32 s21, s6, 0xff00
+; GFX6-NOHSA-NEXT: s_and_b32 s22, s7, 0xff00
+; GFX6-NOHSA-NEXT: s_lshr_b32 s23, s4, 24
+; GFX6-NOHSA-NEXT: s_lshr_b32 s15, s5, 24
+; GFX6-NOHSA-NEXT: s_and_b32 s24, s4, 0xff00
+; GFX6-NOHSA-NEXT: s_and_b32 s25, s5, 0xff00
+; GFX6-NOHSA-NEXT: s_lshr_b32 s26, s2, 24
+; GFX6-NOHSA-NEXT: s_lshr_b32 s17, s3, 24
+; GFX6-NOHSA-NEXT: s_and_b32 s27, s2, 0xff00
+; GFX6-NOHSA-NEXT: s_and_b32 s28, s3, 0xff00
+; GFX6-NOHSA-NEXT: s_lshr_b32 s29, s0, 24
+; GFX6-NOHSA-NEXT: s_lshr_b32 s19, s1, 24
+; GFX6-NOHSA-NEXT: s_and_b32 s30, s0, 0xff00
+; GFX6-NOHSA-NEXT: s_and_b32 s31, s1, 0xff00
+; GFX6-NOHSA-NEXT: s_and_b32 s33, s1, 0xff
+; GFX6-NOHSA-NEXT: s_and_b32 s34, s0, 0xff
+; GFX6-NOHSA-NEXT: s_mov_b32 s18, s1
+; GFX6-NOHSA-NEXT: s_and_b32 s35, s3, 0xff
+; GFX6-NOHSA-NEXT: s_and_b32 s36, s2, 0xff
+; GFX6-NOHSA-NEXT: s_mov_b32 s16, s3
+; GFX6-NOHSA-NEXT: s_and_b32 s37, s5, 0xff
+; GFX6-NOHSA-NEXT: s_and_b32 s38, s4, 0xff
+; GFX6-NOHSA-NEXT: s_mov_b32 s14, s5
+; GFX6-NOHSA-NEXT: s_and_b32 s39, s7, 0xff
+; GFX6-NOHSA-NEXT: s_and_b32 s40, s6, 0xff
+; GFX6-NOHSA-NEXT: s_mov_b32 s12, s7
+; GFX6-NOHSA-NEXT: s_lshl_b32 s31, s31, 8
+; GFX6-NOHSA-NEXT: s_lshl_b32 s30, s30, 8
+; GFX6-NOHSA-NEXT: s_lshr_b64 s[18:19], s[18:19], 16
+; GFX6-NOHSA-NEXT: s_mov_b32 s1, s29
+; GFX6-NOHSA-NEXT: s_lshr_b64 s[0:1], s[0:1], 16
+; GFX6-NOHSA-NEXT: s_lshl_b32 s1, s28, 8
+; GFX6-NOHSA-NEXT: s_lshl_b32 s19, s27, 8
+; GFX6-NOHSA-NEXT: s_lshr_b64 s[16:17], s[16:17], 16
+; GFX6-NOHSA-NEXT: s_mov_b32 s3, s26
+; GFX6-NOHSA-NEXT: s_lshr_b64 s[2:3], s[2:3], 16
+; GFX6-NOHSA-NEXT: s_lshl_b32 s3, s25, 8
+; GFX6-NOHSA-NEXT: s_lshl_b32 s17, s24, 8
+; GFX6-NOHSA-NEXT: s_lshr_b64 s[14:15], s[14:15], 16
+; GFX6-NOHSA-NEXT: s_mov_b32 s5, s23
+; GFX6-NOHSA-NEXT: s_lshr_b64 s[4:5], s[4:5], 16
+; GFX6-NOHSA-NEXT: s_lshl_b32 s5, s22, 8
+; GFX6-NOHSA-NEXT: s_lshl_b32 s15, s21, 8
+; GFX6-NOHSA-NEXT: s_lshr_b64 s[12:13], s[12:13], 16
+; GFX6-NOHSA-NEXT: s_mov_b32 s7, s20
+; GFX6-NOHSA-NEXT: s_lshr_b64 s[6:7], s[6:7], 16
+; GFX6-NOHSA-NEXT: s_or_b32 s7, s33, s31
+; GFX6-NOHSA-NEXT: s_or_b32 s13, s34, s30
+; GFX6-NOHSA-NEXT: s_and_b32 s18, s18, 0xff00ff
+; GFX6-NOHSA-NEXT: s_and_b32 s0, s0, 0xff00ff
+; GFX6-NOHSA-NEXT: s_or_b32 s1, s35, s1
+; GFX6-NOHSA-NEXT: s_or_b32 s19, s36, s19
+; GFX6-NOHSA-NEXT: s_and_b32 s16, s16, 0xff00ff
+; GFX6-NOHSA-NEXT: s_and_b32 s2, s2, 0xff00ff
+; GFX6-NOHSA-NEXT: s_or_b32 s3, s37, s3
+; GFX6-NOHSA-NEXT: s_or_b32 s17, s38, s17
+; GFX6-NOHSA-NEXT: s_and_b32 s14, s14, 0xff00ff
+; GFX6-NOHSA-NEXT: s_or_b32 s5, s39, s5
+; GFX6-NOHSA-NEXT: s_or_b32 s15, s40, s15
+; GFX6-NOHSA-NEXT: s_and_b32 s12, s12, 0xff00ff
+; GFX6-NOHSA-NEXT: s_and_b32 s6, s6, 0xff00ff
+; GFX6-NOHSA-NEXT: s_and_b32 s4, s4, 0xff00ff
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s15
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s6
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s5
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s12
+; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:48
+; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0)
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s17
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s4
 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s3
-; GFX6-NOHSA-NEXT: s_and_b32 s3, s3, 0xff
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s2
-; GFX6-NOHSA-NEXT: s_and_b32 s2, s2, 0xff
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s5
-; GFX6-NOHSA-NEXT: s_and_b32 s5, s5, 0xff
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s4
-; GFX6-NOHSA-NEXT: s_and_b32 s4, s4, 0xff
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s7
-; GFX6-NOHSA-NEXT: s_and_b32 s7, s7, 0xff
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s6
-; GFX6-NOHSA-NEXT: s_and_b32 s6, s6, 0xff
-; GFX6-NOHSA-NEXT: v_alignbit_b32 v0, s27, v0, 16
-; GFX6-NOHSA-NEXT: s_lshl_b32 s26, s26, 8
-; GFX6-NOHSA-NEXT: v_alignbit_b32 v1, s25, v1, 16
-; GFX6-NOHSA-NEXT: s_lshl_b32 s24, s24, 8
-; GFX6-NOHSA-NEXT: v_alignbit_b32 v2, s23, v2, 16
-; GFX6-NOHSA-NEXT: s_lshl_b32 s22, s22, 8
-; GFX6-NOHSA-NEXT: v_alignbit_b32 v8, s21, v3, 16
-; GFX6-NOHSA-NEXT: s_lshl_b32 s20, s20, 8
-; GFX6-NOHSA-NEXT: v_alignbit_b32 v4, s19, v4, 16
-; GFX6-NOHSA-NEXT: s_lshl_b32 s18, s18, 8
-; GFX6-NOHSA-NEXT: v_alignbit_b32 v9, s17, v5, 16
-; GFX6-NOHSA-NEXT: s_lshl_b32 s16, s16, 8
-; GFX6-NOHSA-NEXT: v_alignbit_b32 v6, s15, v6, 16
-; GFX6-NOHSA-NEXT: s_lshl_b32 s14, s14, 8
-; GFX6-NOHSA-NEXT: v_alignbit_b32 v10, s13, v7, 16
-; GFX6-NOHSA-NEXT: s_lshl_b32 s12, s12, 8
-; GFX6-NOHSA-NEXT: v_and_b32_e32 v3, 0xff00ff, v0
-; GFX6-NOHSA-NEXT: s_or_b32 s1, s1, s26
-; GFX6-NOHSA-NEXT: v_and_b32_e32 v1, 0xff00ff, v1
-; GFX6-NOHSA-NEXT: s_or_b32 s0, s0, s24
-; GFX6-NOHSA-NEXT: v_and_b32_e32 v7, 0xff00ff, v2
-; GFX6-NOHSA-NEXT: s_or_b32 s3, s3, s22
-; GFX6-NOHSA-NEXT: v_and_b32_e32 v5, 0xff00ff, v8
-; GFX6-NOHSA-NEXT: s_or_b32 s2, s2, s20
-; GFX6-NOHSA-NEXT: v_and_b32_e32 v11, 0xff00ff, v4
-; GFX6-NOHSA-NEXT: s_or_b32 s5, s5, s18
-; GFX6-NOHSA-NEXT: v_and_b32_e32 v9, 0xff00ff, v9
-; GFX6-NOHSA-NEXT: s_or_b32 s4, s4, s16
-; GFX6-NOHSA-NEXT: v_and_b32_e32 v15, 0xff00ff, v6
-; GFX6-NOHSA-NEXT: s_or_b32 s7, s7, s14
-; GFX6-NOHSA-NEXT: s_or_b32 s6, s6, s12
-; GFX6-NOHSA-NEXT: v_and_b32_e32 v13, 0xff00ff, v10
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s6
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s7
-; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[8:11], 0 offset:48
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v8, s4
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s5
-; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[8:11], 0 offset:32
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s2
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s3
-; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[8:11], 0 offset:16
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s0
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s14
+; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:32
+; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0)
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s19
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s2
 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s1
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s16
+; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:16
+; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0)
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s13
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s0
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s7
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s18
 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0
 ; GFX6-NOHSA-NEXT: s_endpgm
 ;
@@ -11354,99 +11381,106 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i16(ptr addrspace(1) %o
 ; GFX7-HSA: ; %bb.0:
 ; GFX7-HSA-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x0
 ; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
-; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
 ; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0
 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-HSA-NEXT: s_lshr_b32 s25, s1, 24
-; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s1
-; GFX7-HSA-NEXT: v_alignbit_b32 v0, s25, v0, 16
-; GFX7-HSA-NEXT: s_lshr_b32 s23, s0, 24
-; GFX7-HSA-NEXT: v_and_b32_e32 v3, 0xff00ff, v0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0
-; GFX7-HSA-NEXT: v_alignbit_b32 v0, s23, v0, 16
-; GFX7-HSA-NEXT: s_lshr_b32 s21, s3, 24
-; GFX7-HSA-NEXT: v_and_b32_e32 v1, 0xff00ff, v0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s3
-; GFX7-HSA-NEXT: v_alignbit_b32 v0, s21, v0, 16
-; GFX7-HSA-NEXT: s_lshr_b32 s19, s2, 24
-; GFX7-HSA-NEXT: s_and_b32 s24, s1, 0xff00
-; GFX7-HSA-NEXT: v_and_b32_e32 v7, 0xff00ff, v0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2
-; GFX7-HSA-NEXT: s_and_b32 s22, s0, 0xff00
-; GFX7-HSA-NEXT: s_and_b32 s1, s1, 0xff
-; GFX7-HSA-NEXT: s_lshl_b32 s24, s24, 8
-; GFX7-HSA-NEXT: v_alignbit_b32 v0, s19, v0, 16
-; GFX7-HSA-NEXT: s_lshr_b32 s17, s5, 24
+; GFX7-HSA-NEXT: s_and_b32 s22, s1, 0xff00
+; GFX7-HSA-NEXT: s_and_b32 s12, s0, 0xff00
+; GFX7-HSA-NEXT: s_and_b32 s23, s1, 0xff
+; GFX7-HSA-NEXT: s_lshl_b32 s22, s22, 8
+; GFX7-HSA-NEXT: s_lshr_b32 s21, s0, 24
+; GFX7-HSA-NEXT: s_or_b32 s22, s23, s22
+; GFX7-HSA-NEXT: s_and_b32 s23, s0, 0xff
+; GFX7-HSA-NEXT: s_lshl_b32 s12, s12, 8
+; GFX7-HSA-NEXT: s_lshr_b32 s13, s1, 24
+; GFX7-HSA-NEXT: s_or_b32 s23, s23, s12
+; GFX7-HSA-NEXT: s_mov_b32 s12, s1
+; GFX7-HSA-NEXT: s_mov_b32 s1, s21
 ; GFX7-HSA-NEXT: s_and_b32 s20, s3, 0xff00
-; GFX7-HSA-NEXT: s_or_b32 s24, s1, s24
-; GFX7-HSA-NEXT: s_and_b32 s0, s0, 0xff
-; GFX7-HSA-NEXT: s_lshl_b32 s1, s22, 8
-; GFX7-HSA-NEXT: v_and_b32_e32 v5, 0xff00ff, v0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s5
-; GFX7-HSA-NEXT: s_and_b32 s18, s2, 0xff00
-; GFX7-HSA-NEXT: s_or_b32 s22, s0, s1
+; GFX7-HSA-NEXT: s_lshr_b64 s[12:13], s[12:13], 16
+; GFX7-HSA-NEXT: s_lshr_b64 s[0:1], s[0:1], 16
+; GFX7-HSA-NEXT: s_and_b32 s19, s2, 0xff00
+; GFX7-HSA-NEXT: s_and_b32 s13, s0, 0xff00ff
 ; GFX7-HSA-NEXT: s_and_b32 s0, s3, 0xff
 ; GFX7-HSA-NEXT: s_lshl_b32 s1, s20, 8
-; GFX7-HSA-NEXT: v_alignbit_b32 v0, s17, v0, 16
-; GFX7-HSA-NEXT: s_lshr_b32 s15, s4, 24
-; GFX7-HSA-NEXT: s_and_b32 s16, s5, 0xff00
-; GFX7-HSA-NEXT: s_or_b32 s3, s0, s1
+; GFX7-HSA-NEXT: s_or_b32 s20, s0, s1
 ; GFX7-HSA-NEXT: s_and_b32 s0, s2, 0xff
-; GFX7-HSA-NEXT: s_lshl_b32 s1, s18, 8
-; GFX7-HSA-NEXT: v_and_b32_e32 v11, 0xff00ff, v0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s4
-; GFX7-HSA-NEXT: s_and_b32 s14, s4, 0xff00
-; GFX7-HSA-NEXT: s_or_b32 s2, s0, s1
+; GFX7-HSA-NEXT: s_lshl_b32 s1, s19, 8
+; GFX7-HSA-NEXT: s_lshr_b32 s18, s2, 24
+; GFX7-HSA-NEXT: s_or_b32 s19, s0, s1
+; GFX7-HSA-NEXT: s_lshr_b32 s1, s3, 24
+; GFX7-HSA-NEXT: s_mov_b32 s0, s3
+; GFX7-HSA-NEXT: s_lshr_b64 s[0:1], s[0:1], 16
+; GFX7-HSA-NEXT: s_mov_b32 s3, s18
+; GFX7-HSA-NEXT: s_and_b32 s17, s5, 0xff00
+; GFX7-HSA-NEXT: s_and_b32 s21, s0, 0xff00ff
+; GFX7-HSA-NEXT: s_lshr_b64 s[0:1], s[2:3], 16
+; GFX7-HSA-NEXT: s_and_b32 s10, s4, 0xff00
+; GFX7-HSA-NEXT: s_and_b32 s2, s0, 0xff00ff
 ; GFX7-HSA-NEXT: s_and_b32 s0, s5, 0xff
-; GFX7-HSA-NEXT: s_lshl_b32 s1, s16, 8
-; GFX7-HSA-NEXT: v_alignbit_b32 v0, s15, v0, 16
-; GFX7-HSA-NEXT: s_and_b32 s12, s7, 0xff00
-; GFX7-HSA-NEXT: s_lshr_b32 s13, s7, 24
-; GFX7-HSA-NEXT: s_or_b32 s5, s0, s1
-; GFX7-HSA-NEXT: v_and_b32_e32 v9, 0xff00ff, v0
+; GFX7-HSA-NEXT: s_lshl_b32 s1, s17, 8
+; GFX7-HSA-NEXT: s_lshr_b32 s16, s4, 24
+; GFX7-HSA-NEXT: s_lshr_b32 s11, s5, 24
+; GFX7-HSA-NEXT: s_or_b32 s3, s0, s1
 ; GFX7-HSA-NEXT: s_and_b32 s0, s4, 0xff
-; GFX7-HSA-NEXT: s_lshl_b32 s1, s14, 8
-; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s7
-; GFX7-HSA-NEXT: s_and_b32 s10, s6, 0xff00
-; GFX7-HSA-NEXT: s_or_b32 s4, s0, s1
-; GFX7-HSA-NEXT: v_alignbit_b32 v0, s13, v0, 16
+; GFX7-HSA-NEXT: s_lshl_b32 s1, s10, 8
+; GFX7-HSA-NEXT: s_mov_b32 s10, s5
+; GFX7-HSA-NEXT: s_or_b32 s17, s0, s1
+; GFX7-HSA-NEXT: s_lshr_b64 s[0:1], s[10:11], 16
+; GFX7-HSA-NEXT: s_mov_b32 s5, s16
+; GFX7-HSA-NEXT: s_and_b32 s15, s7, 0xff00
+; GFX7-HSA-NEXT: s_and_b32 s10, s0, 0xff00ff
+; GFX7-HSA-NEXT: s_lshr_b64 s[0:1], s[4:5], 16
+; GFX7-HSA-NEXT: s_and_b32 s14, s6, 0xff00
+; GFX7-HSA-NEXT: s_and_b32 s4, s0, 0xff00ff
 ; GFX7-HSA-NEXT: s_and_b32 s0, s7, 0xff
-; GFX7-HSA-NEXT: s_lshl_b32 s1, s12, 8
-; GFX7-HSA-NEXT: s_lshr_b32 s11, s6, 24
-; GFX7-HSA-NEXT: v_and_b32_e32 v15, 0xff00ff, v0
-; GFX7-HSA-NEXT: s_or_b32 s0, s0, s1
-; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s6
-; GFX7-HSA-NEXT: s_and_b32 s1, s6, 0xff
-; GFX7-HSA-NEXT: s_lshl_b32 s6, s10, 8
-; GFX7-HSA-NEXT: s_or_b32 s1, s1, s6
-; GFX7-HSA-NEXT: v_mov_b32_e32 v14, s0
+; GFX7-HSA-NEXT: s_lshl_b32 s1, s15, 8
+; GFX7-HSA-NEXT: s_or_b32 s5, s0, s1
+; GFX7-HSA-NEXT: s_and_b32 s0, s6, 0xff
+; GFX7-HSA-NEXT: s_lshl_b32 s1, s14, 8
+; GFX7-HSA-NEXT: s_or_b32 s11, s0, s1
+; GFX7-HSA-NEXT: s_lshr_b32 s1, s7, 24
+; GFX7-HSA-NEXT: s_mov_b32 s0, s7
+; GFX7-HSA-NEXT: s_lshr_b64 s[0:1], s[0:1], 16
+; GFX7-HSA-NEXT: s_lshr_b32 s7, s6, 24
+; GFX7-HSA-NEXT: s_and_b32 s14, s0, 0xff00ff
+; GFX7-HSA-NEXT: s_lshr_b64 s[0:1], s[6:7], 16
+; GFX7-HSA-NEXT: s_and_b32 s12, s12, 0xff00ff
+; GFX7-HSA-NEXT: s_and_b32 s0, s0, 0xff00ff
+; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s0
 ; GFX7-HSA-NEXT: s_add_u32 s0, s8, 48
-; GFX7-HSA-NEXT: v_mov_b32_e32 v12, s1
 ; GFX7-HSA-NEXT: s_addc_u32 s1, s9, 0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v17, s1
-; GFX7-HSA-NEXT: v_alignbit_b32 v0, s11, v0, 16
-; GFX7-HSA-NEXT: v_mov_b32_e32 v16, s0
+; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1
+; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0
 ; GFX7-HSA-NEXT: s_add_u32 s0, s8, 32
-; GFX7-HSA-NEXT: v_and_b32_e32 v13, 0xff00ff, v0
+; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s11
+; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s5
+; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s14
 ; GFX7-HSA-NEXT: s_addc_u32 s1, s9, 0
-; GFX7-HSA-NEXT: flat_store_dwordx4 v[16:17], v[12:15]
-; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s4
-; GFX7-HSA-NEXT: v_mov_b32_e32 v13, s1
-; GFX7-HSA-NEXT: v_mov_b32_e32 v12, s0
+; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1
+; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0
 ; GFX7-HSA-NEXT: s_add_u32 s0, s8, 16
-; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s5
+; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s17
+; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s3
+; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s10
 ; GFX7-HSA-NEXT: s_addc_u32 s1, s9, 0
-; GFX7-HSA-NEXT: flat_store_dwordx4 v[12:13], v[8:11]
-; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2
-; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s1
-; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s3
-; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s0
-; GFX7-HSA-NEXT: flat_store_dwordx4 v[8:9], v[4:7]
-; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s22
+; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1
+; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s19
+; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s2
+; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s20
+; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s21
+; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0
+; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s8
-; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s24
+; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s23
+; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s13
+; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s22
+; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s12
 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s9
 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
 ; GFX7-HSA-NEXT: s_endpgm
@@ -11463,90 +11497,94 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i16(ptr addrspace(1) %o
 ; GFX8-NOHSA-NEXT: s_lshl_b32 s1, s1, 8
 ; GFX8-NOHSA-NEXT: s_lshl_b32 s14, s14, 16
 ; GFX8-NOHSA-NEXT: s_and_b32 s1, s1, 0xff0000
-; GFX8-NOHSA-NEXT: s_lshr_b32 s13, s0, 24
 ; GFX8-NOHSA-NEXT: s_or_b32 s14, s15, s14
 ; GFX8-NOHSA-NEXT: s_or_b32 s15, s16, s1
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NOHSA-NEXT: s_lshl_b32 s16, s0, 8
+; GFX8-NOHSA-NEXT: s_lshr_b32 s13, s0, 24
 ; GFX8-NOHSA-NEXT: s_and_b32 s1, s0, 0xff
-; GFX8-NOHSA-NEXT: s_lshl_b32 s0, s0, 8
-; GFX8-NOHSA-NEXT: s_and_b32 s0, s0, 0xff0000
-; GFX8-NOHSA-NEXT: v_alignbit_b32 v0, s13, v0, 16
-; GFX8-NOHSA-NEXT: s_or_b32 s13, s1, s0
+; GFX8-NOHSA-NEXT: s_and_b32 s16, s16, 0xff0000
+; GFX8-NOHSA-NEXT: s_or_b32 s16, s1, s16
+; GFX8-NOHSA-NEXT: s_mov_b32 s1, s13
+; GFX8-NOHSA-NEXT: s_lshr_b64 s[0:1], s[0:1], 16
+; GFX8-NOHSA-NEXT: s_and_b32 s13, s0, 0xff00ff
 ; GFX8-NOHSA-NEXT: s_lshr_b32 s0, s3, 24
 ; GFX8-NOHSA-NEXT: s_lshl_b32 s0, s0, 16
 ; GFX8-NOHSA-NEXT: s_bfe_u32 s1, s3, 0x80010
-; GFX8-NOHSA-NEXT: s_or_b32 s16, s1, s0
+; GFX8-NOHSA-NEXT: s_or_b32 s17, s1, s0
 ; GFX8-NOHSA-NEXT: s_lshl_b32 s1, s3, 8
 ; GFX8-NOHSA-NEXT: s_and_b32 s0, s3, 0xff
 ; GFX8-NOHSA-NEXT: s_and_b32 s1, s1, 0xff0000
-; GFX8-NOHSA-NEXT: s_or_b32 s3, s0, s1
+; GFX8-NOHSA-NEXT: s_lshr_b32 s12, s2, 24
+; GFX8-NOHSA-NEXT: s_or_b32 s18, s0, s1
 ; GFX8-NOHSA-NEXT: s_lshl_b32 s1, s2, 8
 ; GFX8-NOHSA-NEXT: s_and_b32 s0, s2, 0xff
 ; GFX8-NOHSA-NEXT: s_and_b32 s1, s1, 0xff0000
-; GFX8-NOHSA-NEXT: s_lshr_b32 s12, s2, 24
-; GFX8-NOHSA-NEXT: v_and_b32_e32 v1, 0xff00ff, v0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NOHSA-NEXT: s_or_b32 s2, s0, s1
+; GFX8-NOHSA-NEXT: s_mov_b32 s3, s12
+; GFX8-NOHSA-NEXT: s_or_b32 s19, s0, s1
+; GFX8-NOHSA-NEXT: s_lshr_b64 s[0:1], s[2:3], 16
+; GFX8-NOHSA-NEXT: s_and_b32 s2, s0, 0xff00ff
 ; GFX8-NOHSA-NEXT: s_lshr_b32 s0, s5, 24
 ; GFX8-NOHSA-NEXT: s_lshl_b32 s0, s0, 16
 ; GFX8-NOHSA-NEXT: s_bfe_u32 s1, s5, 0x80010
-; GFX8-NOHSA-NEXT: v_alignbit_b32 v0, s12, v0, 16
-; GFX8-NOHSA-NEXT: s_or_b32 s12, s1, s0
+; GFX8-NOHSA-NEXT: s_or_b32 s3, s1, s0
 ; GFX8-NOHSA-NEXT: s_lshl_b32 s1, s5, 8
 ; GFX8-NOHSA-NEXT: s_and_b32 s0, s5, 0xff
 ; GFX8-NOHSA-NEXT: s_and_b32 s1, s1, 0xff0000
-; GFX8-NOHSA-NEXT: s_or_b32 s5, s0, s1
+; GFX8-NOHSA-NEXT: s_lshr_b32 s11, s4, 24
+; GFX8-NOHSA-NEXT: s_or_b32 s12, s0, s1
 ; GFX8-NOHSA-NEXT: s_lshl_b32 s1, s4, 8
 ; GFX8-NOHSA-NEXT: s_and_b32 s0, s4, 0xff
 ; GFX8-NOHSA-NEXT: s_and_b32 s1, s1, 0xff0000
-; GFX8-NOHSA-NEXT: s_lshr_b32 s11, s4, 24
-; GFX8-NOHSA-NEXT: v_and_b32_e32 v3, 0xff00ff, v0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4
-; GFX8-NOHSA-NEXT: s_or_b32 s4, s0, s1
+; GFX8-NOHSA-NEXT: s_mov_b32 s5, s11
+; GFX8-NOHSA-NEXT: s_or_b32 s20, s0, s1
+; GFX8-NOHSA-NEXT: s_lshr_b64 s[0:1], s[4:5], 16
+; GFX8-NOHSA-NEXT: s_and_b32 s4, s0, 0xff00ff
 ; GFX8-NOHSA-NEXT: s_lshr_b32 s0, s7, 24
 ; GFX8-NOHSA-NEXT: s_lshl_b32 s0, s0, 16
 ; GFX8-NOHSA-NEXT: s_bfe_u32 s1, s7, 0x80010
-; GFX8-NOHSA-NEXT: s_or_b32 s0, s1, s0
-; GFX8-NOHSA-NEXT: s_and_b32 s1, s7, 0xff
-; GFX8-NOHSA-NEXT: s_lshl_b32 s7, s7, 8
-; GFX8-NOHSA-NEXT: v_alignbit_b32 v0, s11, v0, 16
-; GFX8-NOHSA-NEXT: s_and_b32 s7, s7, 0xff0000
+; GFX8-NOHSA-NEXT: s_or_b32 s5, s1, s0
+; GFX8-NOHSA-NEXT: s_lshl_b32 s1, s7, 8
+; GFX8-NOHSA-NEXT: s_and_b32 s0, s7, 0xff
+; GFX8-NOHSA-NEXT: s_and_b32 s1, s1, 0xff0000
 ; GFX8-NOHSA-NEXT: s_lshr_b32 s10, s6, 24
-; GFX8-NOHSA-NEXT: v_and_b32_e32 v5, 0xff00ff, v0
-; GFX8-NOHSA-NEXT: s_or_b32 s1, s1, s7
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s6
-; GFX8-NOHSA-NEXT: s_and_b32 s7, s6, 0xff
-; GFX8-NOHSA-NEXT: s_lshl_b32 s6, s6, 8
-; GFX8-NOHSA-NEXT: s_and_b32 s6, s6, 0xff0000
-; GFX8-NOHSA-NEXT: s_or_b32 s6, s7, s6
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v9, s0
+; GFX8-NOHSA-NEXT: s_or_b32 s11, s0, s1
+; GFX8-NOHSA-NEXT: s_lshl_b32 s1, s6, 8
+; GFX8-NOHSA-NEXT: s_and_b32 s0, s6, 0xff
+; GFX8-NOHSA-NEXT: s_and_b32 s1, s1, 0xff0000
+; GFX8-NOHSA-NEXT: s_mov_b32 s7, s10
+; GFX8-NOHSA-NEXT: s_or_b32 s21, s0, s1
+; GFX8-NOHSA-NEXT: s_lshr_b64 s[0:1], s[6:7], 16
+; GFX8-NOHSA-NEXT: s_and_b32 s0, s0, 0xff00ff
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s0
 ; GFX8-NOHSA-NEXT: s_add_u32 s0, s8, 48
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v8, s1
 ; GFX8-NOHSA-NEXT: s_addc_u32 s1, s9, 0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v11, s1
-; GFX8-NOHSA-NEXT: v_alignbit_b32 v0, s10, v0, 16
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0
 ; GFX8-NOHSA-NEXT: s_add_u32 s0, s8, 32
-; GFX8-NOHSA-NEXT: v_and_b32_e32 v7, 0xff00ff, v0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s6
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s21
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s11
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s5
 ; GFX8-NOHSA-NEXT: s_addc_u32 s1, s9, 0
-; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[10:11], v[6:9]
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v9, s1
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v8, s0
+; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0
 ; GFX8-NOHSA-NEXT: s_add_u32 s0, s8, 16
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s5
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s12
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s20
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s4
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s12
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s3
 ; GFX8-NOHSA-NEXT: s_addc_u32 s1, s9, 0
-; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[8:9], v[4:7]
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s2
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s1
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s3
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s16
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s0
-; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[6:7], v[2:5]
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s13
+; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s19
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s2
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s18
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s17
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0
+; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s8
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s16
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s13
 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s15
 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s14
 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s9
diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i8.ll b/llvm/test/CodeGen/AMDGPU/load-global-i8.ll
index cb17f01853221..0c399d65d01cc 100644
--- a/llvm/test/CodeGen/AMDGPU/load-global-i8.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-global-i8.ll
@@ -9828,14 +9828,14 @@ define amdgpu_kernel void @global_zextload_v4i8_to_v4i16(ptr addrspace(1) %out,
 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s4, s0
 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s5, s1
 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v1, 0xff00, v0
-; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v2, 24, v0
+; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v1, 24, v0
+; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v2, 0xff00, v0
 ; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v3, 0xff, v0
-; GCN-NOHSA-SI-NEXT: v_alignbit_b32 v0, v2, v0, 16
-; GCN-NOHSA-SI-NEXT: v_lshlrev_b32_e32 v2, 8, v1
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v1, 0xff00ff, v0
-; GCN-NOHSA-SI-NEXT: v_or_b32_e32 v0, v3, v2
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GCN-NOHSA-SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; GCN-NOHSA-SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16
+; GCN-NOHSA-SI-NEXT: v_or_b32_e32 v1, v3, v2
+; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v2, 0xff00ff, v0
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx2 v[1:2], off, s[4:7], 0
 ; GCN-NOHSA-SI-NEXT: s_endpgm
 ;
 ; GCN-HSA-LABEL: global_zextload_v4i8_to_v4i16:
@@ -9847,18 +9847,18 @@ define amdgpu_kernel void @global_zextload_v4i8_to_v4i16(ptr addrspace(1) %out,
 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
-; GCN-HSA-NEXT: flat_load_dword v2, v[0:1]
-; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0
-; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1
+; GCN-HSA-NEXT: flat_load_dword v0, v[0:1]
+; GCN-HSA-NEXT: v_mov_b32_e32 v2, s0
+; GCN-HSA-NEXT: v_mov_b32_e32 v3, s1
 ; GCN-HSA-NEXT: s_waitcnt vmcnt(0)
-; GCN-HSA-NEXT: v_and_b32_e32 v3, 0xff00, v2
-; GCN-HSA-NEXT: v_lshrrev_b32_e32 v4, 24, v2
-; GCN-HSA-NEXT: v_and_b32_e32 v5, 0xff, v2
-; GCN-HSA-NEXT: v_alignbit_b32 v2, v4, v2, 16
-; GCN-HSA-NEXT: v_lshlrev_b32_e32 v4, 8, v3
-; GCN-HSA-NEXT: v_and_b32_e32 v3, 0xff00ff, v2
-; GCN-HSA-NEXT: v_or_b32_e32 v2, v5, v4
-; GCN-HSA-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
+; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 24, v0
+; GCN-HSA-NEXT: v_and_b32_e32 v4, 0xff00, v0
+; GCN-HSA-NEXT: v_and_b32_e32 v5, 0xff, v0
+; GCN-HSA-NEXT: v_lshr_b64 v[0:1], v[0:1], 16
+; GCN-HSA-NEXT: v_lshlrev_b32_e32 v4, 8, v4
+; GCN-HSA-NEXT: v_or_b32_e32 v4, v5, v4
+; GCN-HSA-NEXT: v_and_b32_e32 v5, 0xff00ff, v0
+; GCN-HSA-NEXT: flat_store_dwordx2 v[2:3], v[4:5]
 ; GCN-HSA-NEXT: s_endpgm
 ;
 ; GCN-NOHSA-VI-LABEL: global_zextload_v4i8_to_v4i16:
@@ -9877,10 +9877,10 @@ define amdgpu_kernel void @global_zextload_v4i8_to_v4i16(ptr addrspace(1) %out,
 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v1, 24, v0
 ; GCN-NOHSA-VI-NEXT: v_lshlrev_b32_e32 v2, 8, v0
-; GCN-NOHSA-VI-NEXT: v_alignbit_b32 v1, v1, v0, 16
-; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v2, 0xff0000, v2
+; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v3, 0xff0000, v2
+; GCN-NOHSA-VI-NEXT: v_lshrrev_b64 v[1:2], 16, v[0:1]
+; GCN-NOHSA-VI-NEXT: v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v1, 0xff00ff, v1
-; GCN-NOHSA-VI-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GCN-NOHSA-VI-NEXT: s_endpgm
 ;
@@ -10179,33 +10179,39 @@ define amdgpu_kernel void @global_sextload_v4i8_to_v4i16(ptr addrspace(1) %out,
 define amdgpu_kernel void @global_zextload_v8i8_to_v8i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
 ; GCN-NOHSA-SI-LABEL: global_zextload_v8i8_to_v8i16:
 ; GCN-NOHSA-SI: ; %bb.0:
-; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
-; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000
-; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1
-; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6
-; GCN-NOHSA-SI-NEXT: s_mov_b32 s11, s7
+; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9
+; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1
+; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s2
+; GCN-NOHSA-SI-NEXT: s_mov_b32 s11, s3
 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s2
-; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s3
+; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s6
+; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s7
 ; GCN-NOHSA-SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
-; GCN-NOHSA-SI-NEXT: s_mov_b32 s4, s0
-; GCN-NOHSA-SI-NEXT: s_mov_b32 s5, s1
 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v2, 0xff00, v0
-; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v3, 24, v0
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v4, 0xff00, v1
-; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v5, 24, v1
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v6, 0xff, v1
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v7, 0xff, v0
-; GCN-NOHSA-SI-NEXT: v_alignbit_b32 v1, v5, v1, 16
-; GCN-NOHSA-SI-NEXT: v_lshlrev_b32_e32 v4, 8, v4
-; GCN-NOHSA-SI-NEXT: v_alignbit_b32 v0, v3, v0, 16
-; GCN-NOHSA-SI-NEXT: v_lshlrev_b32_e32 v5, 8, v2
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v3, 0xff00ff, v1
-; GCN-NOHSA-SI-NEXT: v_or_b32_e32 v2, v6, v4
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v1, 0xff00ff, v0
-; GCN-NOHSA-SI-NEXT: v_or_b32_e32 v0, v7, v5
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s0, v0
+; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s6, v1
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s1, s0, 24
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s7, s6, 24
+; GCN-NOHSA-SI-NEXT: s_and_b32 s8, s0, 0xff00
+; GCN-NOHSA-SI-NEXT: s_and_b32 s9, s6, 0xff00
+; GCN-NOHSA-SI-NEXT: s_and_b32 s10, s6, 0xff
+; GCN-NOHSA-SI-NEXT: s_and_b32 s11, s0, 0xff
+; GCN-NOHSA-SI-NEXT: s_lshl_b32 s9, s9, 8
+; GCN-NOHSA-SI-NEXT: s_lshl_b32 s8, s8, 8
+; GCN-NOHSA-SI-NEXT: s_lshr_b64 s[6:7], s[6:7], 16
+; GCN-NOHSA-SI-NEXT: s_lshr_b64 s[0:1], s[0:1], 16
+; GCN-NOHSA-SI-NEXT: s_or_b32 s7, s10, s9
+; GCN-NOHSA-SI-NEXT: s_or_b32 s8, s11, s8
+; GCN-NOHSA-SI-NEXT: s_and_b32 s6, s6, 0xff00ff
+; GCN-NOHSA-SI-NEXT: s_and_b32 s9, s0, 0xff00ff
+; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s4
+; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s5
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s8
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s9
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s7
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s6
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; GCN-NOHSA-SI-NEXT: s_endpgm
 ;
 ; GCN-HSA-LABEL: global_zextload_v8i8_to_v8i16:
@@ -10221,20 +10227,26 @@ define amdgpu_kernel void @global_zextload_v8i8_to_v8i16(ptr addrspace(1) %out,
 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0
 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1
 ; GCN-HSA-NEXT: s_waitcnt vmcnt(0)
-; GCN-HSA-NEXT: v_and_b32_e32 v2, 0xff00, v0
-; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 24, v0
-; GCN-HSA-NEXT: v_and_b32_e32 v6, 0xff00, v1
-; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 24, v1
-; GCN-HSA-NEXT: v_and_b32_e32 v8, 0xff, v1
-; GCN-HSA-NEXT: v_and_b32_e32 v9, 0xff, v0
-; GCN-HSA-NEXT: v_alignbit_b32 v1, v7, v1, 16
-; GCN-HSA-NEXT: v_lshlrev_b32_e32 v6, 8, v6
-; GCN-HSA-NEXT: v_alignbit_b32 v0, v3, v0, 16
-; GCN-HSA-NEXT: v_lshlrev_b32_e32 v7, 8, v2
-; GCN-HSA-NEXT: v_and_b32_e32 v3, 0xff00ff, v1
-; GCN-HSA-NEXT: v_or_b32_e32 v2, v8, v6
-; GCN-HSA-NEXT: v_and_b32_e32 v1, 0xff00ff, v0
-; GCN-HSA-NEXT: v_or_b32_e32 v0, v9, v7
+; GCN-HSA-NEXT: v_readfirstlane_b32 s0, v0
+; GCN-HSA-NEXT: v_readfirstlane_b32 s2, v1
+; GCN-HSA-NEXT: s_lshr_b32 s1, s0, 24
+; GCN-HSA-NEXT: s_lshr_b32 s3, s2, 24
+; GCN-HSA-NEXT: s_and_b32 s4, s0, 0xff00
+; GCN-HSA-NEXT: s_and_b32 s5, s2, 0xff00
+; GCN-HSA-NEXT: s_and_b32 s6, s2, 0xff
+; GCN-HSA-NEXT: s_and_b32 s7, s0, 0xff
+; GCN-HSA-NEXT: s_lshl_b32 s5, s5, 8
+; GCN-HSA-NEXT: s_lshl_b32 s4, s4, 8
+; GCN-HSA-NEXT: s_lshr_b64 s[2:3], s[2:3], 16
+; GCN-HSA-NEXT: s_lshr_b64 s[0:1], s[0:1], 16
+; GCN-HSA-NEXT: s_or_b32 s1, s6, s5
+; GCN-HSA-NEXT: s_or_b32 s3, s7, s4
+; GCN-HSA-NEXT: s_and_b32 s2, s2, 0xff00ff
+; GCN-HSA-NEXT: s_and_b32 s0, s0, 0xff00ff
+; GCN-HSA-NEXT: v_mov_b32_e32 v0, s3
+; GCN-HSA-NEXT: v_mov_b32_e32 v1, s0
+; GCN-HSA-NEXT: v_mov_b32_e32 v2, s1
+; GCN-HSA-NEXT: v_mov_b32_e32 v3, s2
 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
 ; GCN-HSA-NEXT: s_endpgm
 ;
@@ -10252,22 +10264,26 @@ define amdgpu_kernel void @global_zextload_v8i8_to_v8i16(ptr addrspace(1) %out,
 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4
 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5
 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
-; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s4, v1
+; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s4, v0
+; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s6, v1
 ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s5, s4, 24
-; GCN-NOHSA-VI-NEXT: s_bfe_u32 s6, s4, 0x80010
-; GCN-NOHSA-VI-NEXT: s_and_b32 s7, s4, 0xff
-; GCN-NOHSA-VI-NEXT: s_lshl_b32 s4, s4, 8
-; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v2, 24, v0
-; GCN-NOHSA-VI-NEXT: v_lshlrev_b32_e32 v1, 8, v0
-; GCN-NOHSA-VI-NEXT: s_lshl_b32 s5, s5, 16
-; GCN-NOHSA-VI-NEXT: s_and_b32 s4, s4, 0xff0000
-; GCN-NOHSA-VI-NEXT: v_alignbit_b32 v2, v2, v0, 16
-; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v3, 0xff0000, v1
-; GCN-NOHSA-VI-NEXT: s_or_b32 s5, s6, s5
-; GCN-NOHSA-VI-NEXT: s_or_b32 s4, s7, s4
-; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v1, 0xff00ff, v2
-; GCN-NOHSA-VI-NEXT: v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s4
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s7, s6, 24
+; GCN-NOHSA-VI-NEXT: s_bfe_u32 s8, s6, 0x80010
+; GCN-NOHSA-VI-NEXT: s_and_b32 s9, s6, 0xff
+; GCN-NOHSA-VI-NEXT: s_lshl_b32 s6, s6, 8
+; GCN-NOHSA-VI-NEXT: s_lshl_b32 s11, s4, 8
+; GCN-NOHSA-VI-NEXT: s_and_b32 s10, s4, 0xff
+; GCN-NOHSA-VI-NEXT: s_lshl_b32 s7, s7, 16
+; GCN-NOHSA-VI-NEXT: s_and_b32 s6, s6, 0xff0000
+; GCN-NOHSA-VI-NEXT: s_and_b32 s11, s11, 0xff0000
+; GCN-NOHSA-VI-NEXT: s_lshr_b64 s[4:5], s[4:5], 16
+; GCN-NOHSA-VI-NEXT: s_or_b32 s5, s8, s7
+; GCN-NOHSA-VI-NEXT: s_or_b32 s6, s9, s6
+; GCN-NOHSA-VI-NEXT: s_or_b32 s7, s10, s11
+; GCN-NOHSA-VI-NEXT: s_and_b32 s4, s4, 0xff00ff
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s7
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s4
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s6
 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s5
 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; GCN-NOHSA-VI-NEXT: s_endpgm
@@ -10763,35 +10779,48 @@ define amdgpu_kernel void @global_zextload_v16i8_to_v16i16(ptr addrspace(1) %out
 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s4
 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s5
 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v4, 0xff00, v2
-; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v5, 24, v2
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v6, 0xff00, v3
-; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v7, 24, v3
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v8, 0xff00, v0
-; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v9, 24, v0
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v10, 0xff00, v1
-; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v11, 24, v1
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v12, 0xff, v1
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v13, 0xff, v0
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v14, 0xff, v3
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v15, 0xff, v2
-; GCN-NOHSA-SI-NEXT: v_alignbit_b32 v1, v11, v1, 16
-; GCN-NOHSA-SI-NEXT: v_lshlrev_b32_e32 v10, 8, v10
-; GCN-NOHSA-SI-NEXT: v_alignbit_b32 v0, v9, v0, 16
-; GCN-NOHSA-SI-NEXT: v_lshlrev_b32_e32 v8, 8, v8
-; GCN-NOHSA-SI-NEXT: v_alignbit_b32 v7, v7, v3, 16
-; GCN-NOHSA-SI-NEXT: v_lshlrev_b32_e32 v6, 8, v6
-; GCN-NOHSA-SI-NEXT: v_alignbit_b32 v5, v5, v2, 16
-; GCN-NOHSA-SI-NEXT: v_lshlrev_b32_e32 v4, 8, v4
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v3, 0xff00ff, v1
-; GCN-NOHSA-SI-NEXT: v_or_b32_e32 v2, v12, v10
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v1, 0xff00ff, v0
-; GCN-NOHSA-SI-NEXT: v_or_b32_e32 v0, v13, v8
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v7, 0xff00ff, v7
-; GCN-NOHSA-SI-NEXT: v_or_b32_e32 v6, v14, v6
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v5, 0xff00ff, v5
-; GCN-NOHSA-SI-NEXT: v_or_b32_e32 v4, v15, v4
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
+; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s4, v2
+; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s6, v3
+; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s8, v0
+; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s10, v1
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s5, s4, 24
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s7, s6, 24
+; GCN-NOHSA-SI-NEXT: s_and_b32 s12, s4, 0xff00
+; GCN-NOHSA-SI-NEXT: s_and_b32 s13, s6, 0xff00
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s9, s8, 24
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s11, s10, 24
+; GCN-NOHSA-SI-NEXT: s_and_b32 s14, s8, 0xff00
+; GCN-NOHSA-SI-NEXT: s_and_b32 s15, s10, 0xff00
+; GCN-NOHSA-SI-NEXT: s_and_b32 s16, s10, 0xff
+; GCN-NOHSA-SI-NEXT: s_and_b32 s17, s8, 0xff
+; GCN-NOHSA-SI-NEXT: s_and_b32 s18, s6, 0xff
+; GCN-NOHSA-SI-NEXT: s_and_b32 s19, s4, 0xff
+; GCN-NOHSA-SI-NEXT: s_lshl_b32 s15, s15, 8
+; GCN-NOHSA-SI-NEXT: s_lshl_b32 s14, s14, 8
+; GCN-NOHSA-SI-NEXT: s_lshr_b64 s[10:11], s[10:11], 16
+; GCN-NOHSA-SI-NEXT: s_lshr_b64 s[8:9], s[8:9], 16
+; GCN-NOHSA-SI-NEXT: s_lshl_b32 s9, s13, 8
+; GCN-NOHSA-SI-NEXT: s_lshl_b32 s11, s12, 8
+; GCN-NOHSA-SI-NEXT: s_lshr_b64 s[6:7], s[6:7], 16
+; GCN-NOHSA-SI-NEXT: s_lshr_b64 s[4:5], s[4:5], 16
+; GCN-NOHSA-SI-NEXT: s_or_b32 s5, s16, s15
+; GCN-NOHSA-SI-NEXT: s_or_b32 s7, s17, s14
+; GCN-NOHSA-SI-NEXT: s_and_b32 s10, s10, 0xff00ff
+; GCN-NOHSA-SI-NEXT: s_or_b32 s9, s18, s9
+; GCN-NOHSA-SI-NEXT: s_or_b32 s11, s19, s11
+; GCN-NOHSA-SI-NEXT: s_and_b32 s6, s6, 0xff00ff
+; GCN-NOHSA-SI-NEXT: s_and_b32 s4, s4, 0xff00ff
+; GCN-NOHSA-SI-NEXT: s_and_b32 s8, s8, 0xff00ff
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s11
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s4
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s9
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s6
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
+; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s7
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s8
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s5
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s10
 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; GCN-NOHSA-SI-NEXT: s_endpgm
 ;
@@ -10805,43 +10834,55 @@ define amdgpu_kernel void @global_zextload_v16i8_to_v16i16(ptr addrspace(1) %out
 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
 ; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
-; GCN-HSA-NEXT: s_add_u32 s2, s0, 16
-; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v11, s3
 ; GCN-HSA-NEXT: v_mov_b32_e32 v9, s1
-; GCN-HSA-NEXT: v_mov_b32_e32 v10, s2
 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, s0
 ; GCN-HSA-NEXT: s_waitcnt vmcnt(0)
-; GCN-HSA-NEXT: v_and_b32_e32 v4, 0xff00, v2
-; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 24, v2
-; GCN-HSA-NEXT: v_and_b32_e32 v6, 0xff00, v3
-; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 24, v3
-; GCN-HSA-NEXT: v_and_b32_e32 v12, 0xff00, v0
-; GCN-HSA-NEXT: v_lshrrev_b32_e32 v13, 24, v0
-; GCN-HSA-NEXT: v_and_b32_e32 v14, 0xff00, v1
-; GCN-HSA-NEXT: v_lshrrev_b32_e32 v15, 24, v1
-; GCN-HSA-NEXT: v_and_b32_e32 v18, 0xff, v3
-; GCN-HSA-NEXT: v_and_b32_e32 v19, 0xff, v2
-; GCN-HSA-NEXT: v_alignbit_b32 v7, v7, v3, 16
-; GCN-HSA-NEXT: v_lshlrev_b32_e32 v6, 8, v6
-; GCN-HSA-NEXT: v_alignbit_b32 v5, v5, v2, 16
-; GCN-HSA-NEXT: v_lshlrev_b32_e32 v4, 8, v4
-; GCN-HSA-NEXT: v_and_b32_e32 v16, 0xff, v1
-; GCN-HSA-NEXT: v_and_b32_e32 v17, 0xff, v0
-; GCN-HSA-NEXT: v_alignbit_b32 v1, v15, v1, 16
-; GCN-HSA-NEXT: v_lshlrev_b32_e32 v14, 8, v14
-; GCN-HSA-NEXT: v_alignbit_b32 v0, v13, v0, 16
-; GCN-HSA-NEXT: v_lshlrev_b32_e32 v12, 8, v12
-; GCN-HSA-NEXT: v_and_b32_e32 v7, 0xff00ff, v7
-; GCN-HSA-NEXT: v_or_b32_e32 v6, v18, v6
-; GCN-HSA-NEXT: v_and_b32_e32 v5, 0xff00ff, v5
-; GCN-HSA-NEXT: v_or_b32_e32 v4, v19, v4
-; GCN-HSA-NEXT: v_and_b32_e32 v3, 0xff00ff, v1
-; GCN-HSA-NEXT: v_or_b32_e32 v2, v16, v14
-; GCN-HSA-NEXT: v_and_b32_e32 v1, 0xff00ff, v0
-; GCN-HSA-NEXT: v_or_b32_e32 v0, v17, v12
-; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[4:7]
-; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
+; GCN-HSA-NEXT: v_readfirstlane_b32 s6, v0
+; GCN-HSA-NEXT: v_readfirstlane_b32 s8, v1
+; GCN-HSA-NEXT: v_readfirstlane_b32 s2, v2
+; GCN-HSA-NEXT: v_readfirstlane_b32 s4, v3
+; GCN-HSA-NEXT: s_lshr_b32 s7, s6, 24
+; GCN-HSA-NEXT: s_lshr_b32 s9, s8, 24
+; GCN-HSA-NEXT: s_lshr_b32 s3, s2, 24
+; GCN-HSA-NEXT: s_lshr_b32 s5, s4, 24
+; GCN-HSA-NEXT: s_and_b32 s10, s2, 0xff00
+; GCN-HSA-NEXT: s_and_b32 s11, s4, 0xff00
+; GCN-HSA-NEXT: s_and_b32 s12, s6, 0xff00
+; GCN-HSA-NEXT: s_and_b32 s13, s8, 0xff00
+; GCN-HSA-NEXT: s_and_b32 s14, s8, 0xff
+; GCN-HSA-NEXT: s_and_b32 s15, s6, 0xff
+; GCN-HSA-NEXT: s_lshr_b64 s[8:9], s[8:9], 16
+; GCN-HSA-NEXT: s_lshr_b64 s[6:7], s[6:7], 16
+; GCN-HSA-NEXT: s_and_b32 s16, s4, 0xff
+; GCN-HSA-NEXT: s_and_b32 s17, s2, 0xff
+; GCN-HSA-NEXT: s_lshl_b32 s13, s13, 8
+; GCN-HSA-NEXT: s_lshl_b32 s12, s12, 8
+; GCN-HSA-NEXT: s_lshl_b32 s7, s11, 8
+; GCN-HSA-NEXT: s_lshl_b32 s9, s10, 8
+; GCN-HSA-NEXT: s_lshr_b64 s[4:5], s[4:5], 16
+; GCN-HSA-NEXT: s_lshr_b64 s[2:3], s[2:3], 16
+; GCN-HSA-NEXT: s_or_b32 s3, s14, s13
+; GCN-HSA-NEXT: s_or_b32 s5, s15, s12
+; GCN-HSA-NEXT: s_and_b32 s8, s8, 0xff00ff
+; GCN-HSA-NEXT: s_and_b32 s6, s6, 0xff00ff
+; GCN-HSA-NEXT: s_or_b32 s7, s16, s7
+; GCN-HSA-NEXT: s_or_b32 s9, s17, s9
+; GCN-HSA-NEXT: s_and_b32 s4, s4, 0xff00ff
+; GCN-HSA-NEXT: s_and_b32 s2, s2, 0xff00ff
+; GCN-HSA-NEXT: s_add_u32 s0, s0, 16
+; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0
+; GCN-HSA-NEXT: v_mov_b32_e32 v11, s1
+; GCN-HSA-NEXT: v_mov_b32_e32 v0, s9
+; GCN-HSA-NEXT: v_mov_b32_e32 v1, s2
+; GCN-HSA-NEXT: v_mov_b32_e32 v2, s7
+; GCN-HSA-NEXT: v_mov_b32_e32 v3, s4
+; GCN-HSA-NEXT: v_mov_b32_e32 v10, s0
+; GCN-HSA-NEXT: v_mov_b32_e32 v4, s5
+; GCN-HSA-NEXT: v_mov_b32_e32 v5, s6
+; GCN-HSA-NEXT: v_mov_b32_e32 v6, s3
+; GCN-HSA-NEXT: v_mov_b32_e32 v7, s8
+; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[0:3]
+; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[4:7]
 ; GCN-HSA-NEXT: s_endpgm
 ;
 ; GCN-NOHSA-VI-LABEL: global_zextload_v16i8_to_v16i16:
@@ -10858,42 +10899,50 @@ define amdgpu_kernel void @global_zextload_v16i8_to_v16i16(ptr addrspace(1) %out
 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4
 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5
 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
-; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s4, v3
-; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s5, v1
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s6, s5, 24
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s9, s4, 24
-; GCN-NOHSA-VI-NEXT: s_bfe_u32 s10, s4, 0x80010
-; GCN-NOHSA-VI-NEXT: s_and_b32 s11, s4, 0xff
-; GCN-NOHSA-VI-NEXT: s_lshl_b32 s4, s4, 8
-; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v4, 24, v2
-; GCN-NOHSA-VI-NEXT: v_lshlrev_b32_e32 v1, 8, v0
-; GCN-NOHSA-VI-NEXT: v_lshlrev_b32_e32 v5, 8, v2
-; GCN-NOHSA-VI-NEXT: s_bfe_u32 s7, s5, 0x80010
-; GCN-NOHSA-VI-NEXT: s_and_b32 s8, s5, 0xff
-; GCN-NOHSA-VI-NEXT: s_lshl_b32 s5, s5, 8
-; GCN-NOHSA-VI-NEXT: s_lshl_b32 s6, s6, 16
-; GCN-NOHSA-VI-NEXT: s_lshl_b32 s9, s9, 16
-; GCN-NOHSA-VI-NEXT: s_and_b32 s4, s4, 0xff0000
-; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v3, 24, v0
-; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v6, 0xff0000, v1
-; GCN-NOHSA-VI-NEXT: v_alignbit_b32 v4, v4, v2, 16
-; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v7, 0xff0000, v5
-; GCN-NOHSA-VI-NEXT: s_and_b32 s5, s5, 0xff0000
-; GCN-NOHSA-VI-NEXT: s_or_b32 s6, s7, s6
-; GCN-NOHSA-VI-NEXT: s_or_b32 s7, s10, s9
-; GCN-NOHSA-VI-NEXT: s_or_b32 s4, s11, s4
-; GCN-NOHSA-VI-NEXT: v_alignbit_b32 v3, v3, v0, 16
-; GCN-NOHSA-VI-NEXT: v_or_b32_sdwa v0, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v5, 0xff00ff, v4
-; GCN-NOHSA-VI-NEXT: v_or_b32_sdwa v4, v2, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GCN-NOHSA-VI-NEXT: s_or_b32 s5, s8, s5
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v6, s4
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v7, s7
-; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v1, 0xff00ff, v3
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s5
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s6
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s6, v0
+; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s4, v2
+; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s8, v3
+; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s9, v1
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s7, s6, 24
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s5, s4, 24
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s10, s9, 24
+; GCN-NOHSA-VI-NEXT: s_and_b32 s13, s6, 0xff
+; GCN-NOHSA-VI-NEXT: s_lshl_b32 s14, s6, 8
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s15, s8, 24
+; GCN-NOHSA-VI-NEXT: s_bfe_u32 s16, s8, 0x80010
+; GCN-NOHSA-VI-NEXT: s_and_b32 s17, s8, 0xff
+; GCN-NOHSA-VI-NEXT: s_lshl_b32 s8, s8, 8
+; GCN-NOHSA-VI-NEXT: s_lshl_b32 s19, s4, 8
+; GCN-NOHSA-VI-NEXT: s_lshr_b64 s[6:7], s[6:7], 16
+; GCN-NOHSA-VI-NEXT: s_bfe_u32 s11, s9, 0x80010
+; GCN-NOHSA-VI-NEXT: s_and_b32 s12, s9, 0xff
+; GCN-NOHSA-VI-NEXT: s_lshl_b32 s9, s9, 8
+; GCN-NOHSA-VI-NEXT: s_and_b32 s18, s4, 0xff
+; GCN-NOHSA-VI-NEXT: s_lshl_b32 s10, s10, 16
+; GCN-NOHSA-VI-NEXT: s_lshl_b32 s7, s15, 16
+; GCN-NOHSA-VI-NEXT: s_and_b32 s8, s8, 0xff0000
+; GCN-NOHSA-VI-NEXT: s_and_b32 s15, s19, 0xff0000
+; GCN-NOHSA-VI-NEXT: s_lshr_b64 s[4:5], s[4:5], 16
+; GCN-NOHSA-VI-NEXT: s_and_b32 s9, s9, 0xff0000
+; GCN-NOHSA-VI-NEXT: s_and_b32 s14, s14, 0xff0000
+; GCN-NOHSA-VI-NEXT: s_or_b32 s5, s11, s10
+; GCN-NOHSA-VI-NEXT: s_or_b32 s7, s16, s7
+; GCN-NOHSA-VI-NEXT: s_or_b32 s8, s17, s8
+; GCN-NOHSA-VI-NEXT: s_or_b32 s11, s18, s15
+; GCN-NOHSA-VI-NEXT: s_and_b32 s4, s4, 0xff00ff
+; GCN-NOHSA-VI-NEXT: s_or_b32 s9, s12, s9
+; GCN-NOHSA-VI-NEXT: s_or_b32 s10, s13, s14
+; GCN-NOHSA-VI-NEXT: s_and_b32 s6, s6, 0xff00ff
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s11
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s4
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s8
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s7
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s10
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s6
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v6, s9
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v7, s5
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0
 ; GCN-NOHSA-VI-NEXT: s_endpgm
 ;
 ; EG-LABEL: global_zextload_v16i8_to_v16i16:
@@ -11766,71 +11815,97 @@ define amdgpu_kernel void @global_zextload_v32i8_to_v32i16(ptr addrspace(1) %out
 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s6
 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s7
 ; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:16
-; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v4, 0xff00, v2
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v5, 0xff00, v3
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v6, 0xff, v3
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v7, 0xff, v2
-; GCN-NOHSA-SI-NEXT: v_lshlrev_b32_e32 v5, 8, v5
-; GCN-NOHSA-SI-NEXT: v_lshlrev_b32_e32 v4, 8, v4
-; GCN-NOHSA-SI-NEXT: v_or_b32_e32 v6, v6, v5
-; GCN-NOHSA-SI-NEXT: v_or_b32_e32 v4, v7, v4
-; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0
+; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0
 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s4
 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s5
-; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v5, 24, v3
-; GCN-NOHSA-SI-NEXT: v_alignbit_b32 v3, v5, v3, 16
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v7, 0xff00ff, v3
-; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v3, 24, v2
-; GCN-NOHSA-SI-NEXT: v_alignbit_b32 v2, v3, v2, 16
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v5, 0xff00ff, v2
+; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(1)
+; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s4, v2
+; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s6, v3
+; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s8, v0
+; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s10, v1
 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v3, 0xff00, v10
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:48
-; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v2, 24, v10
+; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s12, v6
+; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s14, v7
+; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s16, v4
+; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s18, v5
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s5, s4, 24
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s7, s6, 24
+; GCN-NOHSA-SI-NEXT: s_and_b32 s20, s4, 0xff00
+; GCN-NOHSA-SI-NEXT: s_and_b32 s21, s6, 0xff00
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s9, s8, 24
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s11, s10, 24
+; GCN-NOHSA-SI-NEXT: s_and_b32 s22, s8, 0xff00
+; GCN-NOHSA-SI-NEXT: s_and_b32 s23, s10, 0xff00
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s13, s12, 24 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s15, s14, 24 +; GCN-NOHSA-SI-NEXT: s_and_b32 s24, s12, 0xff00 +; GCN-NOHSA-SI-NEXT: s_and_b32 s25, s14, 0xff00 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s17, s16, 24 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s19, s18, 24 +; GCN-NOHSA-SI-NEXT: s_and_b32 s26, s16, 0xff00 +; GCN-NOHSA-SI-NEXT: s_and_b32 s27, s18, 0xff00 +; GCN-NOHSA-SI-NEXT: s_and_b32 s28, s18, 0xff +; GCN-NOHSA-SI-NEXT: s_and_b32 s29, s16, 0xff +; GCN-NOHSA-SI-NEXT: s_and_b32 s30, s14, 0xff +; GCN-NOHSA-SI-NEXT: s_and_b32 s31, s12, 0xff +; GCN-NOHSA-SI-NEXT: s_and_b32 s33, s10, 0xff +; GCN-NOHSA-SI-NEXT: s_and_b32 s34, s8, 0xff +; GCN-NOHSA-SI-NEXT: s_and_b32 s35, s6, 0xff +; GCN-NOHSA-SI-NEXT: s_and_b32 s36, s4, 0xff +; GCN-NOHSA-SI-NEXT: s_lshl_b32 s27, s27, 8 +; GCN-NOHSA-SI-NEXT: s_lshl_b32 s26, s26, 8 +; GCN-NOHSA-SI-NEXT: s_lshr_b64 s[18:19], s[18:19], 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b64 s[16:17], s[16:17], 16 +; GCN-NOHSA-SI-NEXT: s_lshl_b32 s17, s25, 8 +; GCN-NOHSA-SI-NEXT: s_lshl_b32 s19, s24, 8 +; GCN-NOHSA-SI-NEXT: s_lshr_b64 s[14:15], s[14:15], 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b64 s[12:13], s[12:13], 16 +; GCN-NOHSA-SI-NEXT: s_lshl_b32 s13, s23, 8 +; GCN-NOHSA-SI-NEXT: s_lshl_b32 s15, s22, 8 +; GCN-NOHSA-SI-NEXT: s_lshr_b64 s[10:11], s[10:11], 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b64 s[8:9], s[8:9], 16 +; GCN-NOHSA-SI-NEXT: s_lshl_b32 s9, s21, 8 +; GCN-NOHSA-SI-NEXT: s_lshl_b32 s11, s20, 8 +; GCN-NOHSA-SI-NEXT: s_lshr_b64 s[6:7], s[6:7], 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b64 s[4:5], s[4:5], 16 +; GCN-NOHSA-SI-NEXT: s_or_b32 s5, s28, s27 +; GCN-NOHSA-SI-NEXT: s_or_b32 s7, s29, s26 +; GCN-NOHSA-SI-NEXT: s_and_b32 s18, s18, 0xff00ff +; GCN-NOHSA-SI-NEXT: s_and_b32 s16, s16, 0xff00ff +; GCN-NOHSA-SI-NEXT: s_or_b32 s17, s30, s17 +; GCN-NOHSA-SI-NEXT: s_or_b32 s19, s31, s19 +; GCN-NOHSA-SI-NEXT: s_and_b32 s14, s14, 0xff00ff +; GCN-NOHSA-SI-NEXT: s_and_b32 s12, s12, 0xff00ff +; GCN-NOHSA-SI-NEXT: s_or_b32 s13, s33, s13 +; GCN-NOHSA-SI-NEXT: s_or_b32 s15, s34, s15 +; GCN-NOHSA-SI-NEXT: s_and_b32 s10, s10, 0xff00ff +; GCN-NOHSA-SI-NEXT: s_or_b32 s9, s35, s9 +; GCN-NOHSA-SI-NEXT: s_or_b32 s11, s36, s11 +; GCN-NOHSA-SI-NEXT: s_and_b32 s6, s6, 0xff00ff +; GCN-NOHSA-SI-NEXT: s_and_b32 s4, s4, 0xff00ff +; GCN-NOHSA-SI-NEXT: s_and_b32 s8, s8, 0xff00ff +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s11 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s4 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s9 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s6 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v4, 24, v11 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v5, 24, v8 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v6, 24, v9 -; GCN-NOHSA-SI-NEXT: v_alignbit_b32 v7, v6, v9, 16 -; GCN-NOHSA-SI-NEXT: v_alignbit_b32 v5, v5, v8, 16 -; GCN-NOHSA-SI-NEXT: v_alignbit_b32 v12, v4, v11, 16 -; GCN-NOHSA-SI-NEXT: v_alignbit_b32 v13, v2, v10, 16 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v4, 0xff00, v11 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v2, 0xff00, v8 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v6, 0xff00, v9 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v9, 0xff, v9 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v8, 0xff, v8 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v11, 0xff, v11 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v10, 0xff, v10 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v14, 24, v1 -; GCN-NOHSA-SI-NEXT: v_alignbit_b32 v14, v14, v1, 16 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v15, 24, v0 -; GCN-NOHSA-SI-NEXT: v_alignbit_b32 v15, v15, v0, 16 -; GCN-NOHSA-SI-NEXT: 
v_and_b32_e32 v16, 0xff00, v0 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v17, 0xff00, v1 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v18, 0xff, v0 -; GCN-NOHSA-SI-NEXT: v_lshlrev_b32_e32 v0, 8, v6 -; GCN-NOHSA-SI-NEXT: v_lshlrev_b32_e32 v6, 8, v2 -; GCN-NOHSA-SI-NEXT: v_or_b32_e32 v2, v9, v0 -; GCN-NOHSA-SI-NEXT: v_or_b32_e32 v0, v8, v6 -; GCN-NOHSA-SI-NEXT: v_lshlrev_b32_e32 v4, 8, v4 -; GCN-NOHSA-SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GCN-NOHSA-SI-NEXT: v_or_b32_e32 v6, v11, v4 -; GCN-NOHSA-SI-NEXT: v_or_b32_e32 v4, v10, v3 -; GCN-NOHSA-SI-NEXT: v_lshlrev_b32_e32 v3, 8, v17 -; GCN-NOHSA-SI-NEXT: v_lshlrev_b32_e32 v8, 8, v16 -; GCN-NOHSA-SI-NEXT: v_or_b32_e32 v10, v1, v3 -; GCN-NOHSA-SI-NEXT: v_or_b32_e32 v8, v18, v8 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v3, 0xff00ff, v7 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v1, 0xff00ff, v5 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v7, 0xff00ff, v12 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v5, 0xff00ff, v13 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v11, 0xff00ff, v14 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v9, 0xff00ff, v15 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s15 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s8 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s13 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s10 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s19 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s12 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s17 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s14 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s7 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s16 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s5 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s18 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GCN-NOHSA-SI-NEXT: s_endpgm ; @@ -11843,88 +11918,112 @@ define amdgpu_kernel void @global_zextload_v32i8_to_v32i16(ptr addrspace(1) %out ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 -; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[0:1] +; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v12, s0 +; GCN-HSA-NEXT: s_waitcnt vmcnt(0) +; GCN-HSA-NEXT: v_readfirstlane_b32 s4, v2 +; GCN-HSA-NEXT: v_readfirstlane_b32 s6, v3 +; GCN-HSA-NEXT: v_readfirstlane_b32 s8, v0 +; GCN-HSA-NEXT: v_readfirstlane_b32 s10, v1 +; GCN-HSA-NEXT: s_lshr_b32 s5, s4, 24 +; GCN-HSA-NEXT: s_lshr_b32 s7, s6, 24 +; GCN-HSA-NEXT: s_and_b32 s12, s4, 0xff00 +; GCN-HSA-NEXT: s_and_b32 s13, s6, 0xff00 +; GCN-HSA-NEXT: s_lshr_b32 s9, s8, 24 +; GCN-HSA-NEXT: s_lshr_b32 s11, s10, 24 +; GCN-HSA-NEXT: s_and_b32 s14, s8, 0xff00 +; GCN-HSA-NEXT: s_and_b32 s15, s10, 0xff00 ; GCN-HSA-NEXT: s_add_u32 s2, s2, 16 ; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; GCN-HSA-NEXT: s_lshr_b64 s[2:3], s[10:11], 16 +; GCN-HSA-NEXT: s_and_b32 s17, s8, 0xff +; GCN-HSA-NEXT: s_lshr_b64 s[8:9], s[8:9], 16 +; GCN-HSA-NEXT: s_and_b32 s3, s6, 0xff +; GCN-HSA-NEXT: s_lshr_b64 s[6:7], s[6:7], 16 +; GCN-HSA-NEXT: s_and_b32 s16, s10, 0xff +; GCN-HSA-NEXT: s_lshl_b32 s14, s14, 8 +; 
GCN-HSA-NEXT: s_lshl_b32 s9, s13, 8 +; GCN-HSA-NEXT: s_and_b32 s10, s4, 0xff +; GCN-HSA-NEXT: s_lshl_b32 s11, s12, 8 +; GCN-HSA-NEXT: s_lshr_b64 s[4:5], s[4:5], 16 +; GCN-HSA-NEXT: s_and_b32 s8, s8, 0xff00ff +; GCN-HSA-NEXT: s_and_b32 s6, s6, 0xff00ff +; GCN-HSA-NEXT: s_lshl_b32 s15, s15, 8 +; GCN-HSA-NEXT: s_or_b32 s7, s17, s14 +; GCN-HSA-NEXT: s_and_b32 s2, s2, 0xff00ff +; GCN-HSA-NEXT: s_or_b32 s3, s3, s9 +; GCN-HSA-NEXT: s_or_b32 s9, s10, s11 +; GCN-HSA-NEXT: s_and_b32 s4, s4, 0xff00ff +; GCN-HSA-NEXT: v_mov_b32_e32 v7, s6 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s8 +; GCN-HSA-NEXT: s_or_b32 s5, s16, s15 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s9 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, s7 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v6, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v10, s5 +; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[8:11] +; GCN-HSA-NEXT: s_waitcnt vmcnt(1) +; GCN-HSA-NEXT: v_readfirstlane_b32 s6, v0 +; GCN-HSA-NEXT: v_readfirstlane_b32 s8, v1 +; GCN-HSA-NEXT: v_readfirstlane_b32 s2, v2 +; GCN-HSA-NEXT: v_readfirstlane_b32 s4, v3 +; GCN-HSA-NEXT: s_lshr_b32 s7, s6, 24 +; GCN-HSA-NEXT: s_lshr_b32 s9, s8, 24 +; GCN-HSA-NEXT: s_lshr_b32 s3, s2, 24 +; GCN-HSA-NEXT: s_lshr_b32 s5, s4, 24 +; GCN-HSA-NEXT: s_and_b32 s10, s2, 0xff00 +; GCN-HSA-NEXT: s_and_b32 s11, s4, 0xff00 +; GCN-HSA-NEXT: s_and_b32 s12, s6, 0xff00 +; GCN-HSA-NEXT: s_and_b32 s13, s8, 0xff00 +; GCN-HSA-NEXT: s_and_b32 s14, s8, 0xff +; GCN-HSA-NEXT: s_and_b32 s15, s6, 0xff +; GCN-HSA-NEXT: s_lshr_b64 s[8:9], s[8:9], 16 +; GCN-HSA-NEXT: s_lshr_b64 s[6:7], s[6:7], 16 +; GCN-HSA-NEXT: s_and_b32 s16, s4, 0xff +; GCN-HSA-NEXT: s_and_b32 s17, s2, 0xff +; GCN-HSA-NEXT: s_lshl_b32 s13, s13, 8 +; GCN-HSA-NEXT: s_lshl_b32 s12, s12, 8 +; GCN-HSA-NEXT: s_lshl_b32 s7, s11, 8 +; GCN-HSA-NEXT: s_lshl_b32 s9, s10, 8 +; GCN-HSA-NEXT: s_lshr_b64 s[4:5], s[4:5], 16 +; GCN-HSA-NEXT: s_lshr_b64 s[2:3], s[2:3], 16 +; GCN-HSA-NEXT: s_or_b32 s3, s14, s13 +; GCN-HSA-NEXT: s_or_b32 s5, s15, s12 +; GCN-HSA-NEXT: s_and_b32 s8, s8, 0xff00ff +; GCN-HSA-NEXT: s_and_b32 s6, s6, 0xff00ff +; GCN-HSA-NEXT: s_or_b32 s7, s16, s7 +; GCN-HSA-NEXT: s_or_b32 s9, s17, s9 +; GCN-HSA-NEXT: s_and_b32 s4, s4, 0xff00ff +; GCN-HSA-NEXT: s_and_b32 s10, s2, 0xff00ff ; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v10, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v12, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v11, s1 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 48 -; GCN-HSA-NEXT: v_mov_b32_e32 v10, s0 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[4:7] +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s9 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s10 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s7 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: s_add_u32 s0, s0, 32 -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s3 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v14, s2 -; GCN-HSA-NEXT: s_waitcnt vmcnt(1) -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v9, 24, v7 -; GCN-HSA-NEXT: v_and_b32_e32 v8, 0xff00, v7 -; GCN-HSA-NEXT: v_and_b32_e32 v17, 0xff, v7 -; GCN-HSA-NEXT: v_alignbit_b32 v7, v9, v7, 16 -; GCN-HSA-NEXT: v_and_b32_e32 v16, 0xff00, v6 -; GCN-HSA-NEXT: v_and_b32_e32 v9, 0xff00ff, v7 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 24, v6 -; GCN-HSA-NEXT: v_lshlrev_b32_e32 v8, 8, v8 -; GCN-HSA-NEXT: v_alignbit_b32 v7, v7, v6, 16 
-; GCN-HSA-NEXT: v_and_b32_e32 v6, 0xff, v6 -; GCN-HSA-NEXT: v_lshlrev_b32_e32 v16, 8, v16 -; GCN-HSA-NEXT: v_or_b32_e32 v8, v17, v8 -; GCN-HSA-NEXT: v_and_b32_e32 v7, 0xff00ff, v7 -; GCN-HSA-NEXT: v_or_b32_e32 v6, v6, v16 -; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[6:9] -; GCN-HSA-NEXT: v_and_b32_e32 v12, 0xff, v5 -; GCN-HSA-NEXT: v_and_b32_e32 v6, 0xff00, v4 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 24, v4 -; GCN-HSA-NEXT: v_and_b32_e32 v8, 0xff00, v5 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v9, 24, v5 -; GCN-HSA-NEXT: v_and_b32_e32 v13, 0xff, v4 -; GCN-HSA-NEXT: v_alignbit_b32 v5, v9, v5, 16 -; GCN-HSA-NEXT: v_lshlrev_b32_e32 v8, 8, v8 -; GCN-HSA-NEXT: v_alignbit_b32 v9, v7, v4, 16 -; GCN-HSA-NEXT: v_lshlrev_b32_e32 v4, 8, v6 -; GCN-HSA-NEXT: v_or_b32_e32 v6, v12, v8 -; GCN-HSA-NEXT: v_or_b32_e32 v4, v13, v4 -; GCN-HSA-NEXT: v_and_b32_e32 v7, 0xff00ff, v5 -; GCN-HSA-NEXT: v_and_b32_e32 v5, 0xff00ff, v9 -; GCN-HSA-NEXT: s_waitcnt vmcnt(1) -; GCN-HSA-NEXT: v_and_b32_e32 v18, 0xff00, v2 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v19, 24, v2 -; GCN-HSA-NEXT: v_and_b32_e32 v8, 0xff00, v3 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v12, 24, v3 -; GCN-HSA-NEXT: v_and_b32_e32 v13, 0xff00, v0 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v9, 24, v0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[4:7] -; GCN-HSA-NEXT: v_and_b32_e32 v10, 0xff, v3 -; GCN-HSA-NEXT: v_and_b32_e32 v4, 0xff00, v1 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 24, v1 -; GCN-HSA-NEXT: v_and_b32_e32 v6, 0xff, v1 -; GCN-HSA-NEXT: v_and_b32_e32 v7, 0xff, v0 -; GCN-HSA-NEXT: v_and_b32_e32 v11, 0xff, v2 -; GCN-HSA-NEXT: v_alignbit_b32 v1, v5, v1, 16 -; GCN-HSA-NEXT: v_lshlrev_b32_e32 v4, 8, v4 -; GCN-HSA-NEXT: v_alignbit_b32 v0, v9, v0, 16 -; GCN-HSA-NEXT: v_lshlrev_b32_e32 v5, 8, v13 -; GCN-HSA-NEXT: v_alignbit_b32 v9, v12, v3, 16 -; GCN-HSA-NEXT: v_lshlrev_b32_e32 v8, 8, v8 -; GCN-HSA-NEXT: v_alignbit_b32 v12, v19, v2, 16 -; GCN-HSA-NEXT: v_lshlrev_b32_e32 v13, 8, v18 -; GCN-HSA-NEXT: v_mov_b32_e32 v17, s1 -; GCN-HSA-NEXT: v_and_b32_e32 v3, 0xff00ff, v1 -; GCN-HSA-NEXT: v_or_b32_e32 v2, v6, v4 -; GCN-HSA-NEXT: v_and_b32_e32 v1, 0xff00ff, v0 -; GCN-HSA-NEXT: v_or_b32_e32 v0, v7, v5 -; GCN-HSA-NEXT: v_and_b32_e32 v7, 0xff00ff, v9 -; GCN-HSA-NEXT: v_or_b32_e32 v6, v10, v8 -; GCN-HSA-NEXT: v_and_b32_e32 v5, 0xff00ff, v12 -; GCN-HSA-NEXT: v_or_b32_e32 v4, v11, v13 -; GCN-HSA-NEXT: v_mov_b32_e32 v16, s0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[4:7] -; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[0:3] +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, s5 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s6 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, s8 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1 +; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[8:11] ; GCN-HSA-NEXT: s_endpgm ; ; GCN-NOHSA-VI-LABEL: global_zextload_v32i8_to_v32i16: @@ -11942,79 +12041,95 @@ define amdgpu_kernel void @global_zextload_v32i8_to_v32i16(ptr addrspace(1) %out ; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(1) -; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s4, v3 +; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s6, v0 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s6, v7 -; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s7, v5 -; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s5, v1 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s8, s7, 24 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s11, s6, 24 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s17, s4, 24 -; GCN-NOHSA-VI-NEXT: s_bfe_u32 s18, s4, 0x80010 -; GCN-NOHSA-VI-NEXT: s_and_b32 s19, s4, 0xff -; GCN-NOHSA-VI-NEXT: 
s_lshl_b32 s4, s4, 8 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v8, 24, v2 -; GCN-NOHSA-VI-NEXT: v_lshlrev_b32_e32 v5, 8, v4 -; GCN-NOHSA-VI-NEXT: v_lshlrev_b32_e32 v11, 8, v2 -; GCN-NOHSA-VI-NEXT: s_bfe_u32 s9, s7, 0x80010 -; GCN-NOHSA-VI-NEXT: s_and_b32 s10, s7, 0xff -; GCN-NOHSA-VI-NEXT: s_lshl_b32 s7, s7, 8 -; GCN-NOHSA-VI-NEXT: s_bfe_u32 s12, s6, 0x80010 -; GCN-NOHSA-VI-NEXT: s_and_b32 s13, s6, 0xff -; GCN-NOHSA-VI-NEXT: s_lshl_b32 s6, s6, 8 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s14, s5, 24 -; GCN-NOHSA-VI-NEXT: s_bfe_u32 s15, s5, 0x80010 -; GCN-NOHSA-VI-NEXT: s_and_b32 s16, s5, 0xff -; GCN-NOHSA-VI-NEXT: s_lshl_b32 s5, s5, 8 -; GCN-NOHSA-VI-NEXT: s_lshl_b32 s8, s8, 16 -; GCN-NOHSA-VI-NEXT: s_lshl_b32 s11, s11, 16 -; GCN-NOHSA-VI-NEXT: s_lshl_b32 s17, s17, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s4, s4, 0xff0000 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v3, 24, v0 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v1, 24, v6 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v7, 24, v4 -; GCN-NOHSA-VI-NEXT: v_lshlrev_b32_e32 v9, 8, v6 -; GCN-NOHSA-VI-NEXT: v_lshlrev_b32_e32 v10, 8, v0 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v12, 0xff0000, v5 -; GCN-NOHSA-VI-NEXT: v_alignbit_b32 v13, v8, v2, 16 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v11, 0xff0000, v11 -; GCN-NOHSA-VI-NEXT: s_and_b32 s7, s7, 0xff0000 -; GCN-NOHSA-VI-NEXT: s_and_b32 s6, s6, 0xff0000 -; GCN-NOHSA-VI-NEXT: s_lshl_b32 s14, s14, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s5, s5, 0xff0000 -; GCN-NOHSA-VI-NEXT: s_or_b32 s8, s9, s8 -; GCN-NOHSA-VI-NEXT: s_or_b32 s9, s12, s11 -; GCN-NOHSA-VI-NEXT: s_or_b32 s11, s18, s17 -; GCN-NOHSA-VI-NEXT: s_or_b32 s4, s19, s4 -; GCN-NOHSA-VI-NEXT: v_alignbit_b32 v7, v7, v4, 16 -; GCN-NOHSA-VI-NEXT: v_alignbit_b32 v1, v1, v6, 16 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v9, 0xff0000, v9 -; GCN-NOHSA-VI-NEXT: v_alignbit_b32 v3, v3, v0, 16 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v10, 0xff0000, v10 -; GCN-NOHSA-VI-NEXT: v_or_b32_sdwa v4, v4, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GCN-NOHSA-VI-NEXT: v_or_b32_sdwa v12, v2, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v13, 0xff00ff, v13 -; GCN-NOHSA-VI-NEXT: s_or_b32 s7, s10, s7 -; GCN-NOHSA-VI-NEXT: s_or_b32 s6, s13, s6 -; GCN-NOHSA-VI-NEXT: s_or_b32 s10, s15, s14 -; GCN-NOHSA-VI-NEXT: s_or_b32 s5, s16, s5 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v14, s4 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v15, s11 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v5, 0xff00ff, v7 -; GCN-NOHSA-VI-NEXT: v_or_b32_sdwa v8, v6, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v9, 0xff00ff, v1 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v1, 0xff00ff, v3 -; GCN-NOHSA-VI-NEXT: v_or_b32_sdwa v0, v0, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s5 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s10 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v10, s6 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v11, s9 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v6, s7 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v7, s8 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 +; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s8, v6 +; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s4, v2 +; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s12, v3 +; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s13, v1 +; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s10, v4 +; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s15, v5 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s7, s6, 
24 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s9, s8, 24 +; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s14, v7 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s5, s4, 24 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s11, s10, 24 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s16, s15, 24 +; GCN-NOHSA-VI-NEXT: s_lshl_b32 s20, s10, 8 +; GCN-NOHSA-VI-NEXT: s_and_b32 s24, s8, 0xff +; GCN-NOHSA-VI-NEXT: s_lshl_b32 s25, s8, 8 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s26, s13, 24 +; GCN-NOHSA-VI-NEXT: s_and_b32 s29, s6, 0xff +; GCN-NOHSA-VI-NEXT: s_lshl_b32 s30, s6, 8 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s31, s12, 24 +; GCN-NOHSA-VI-NEXT: s_bfe_u32 s33, s12, 0x80010 +; GCN-NOHSA-VI-NEXT: s_and_b32 s34, s12, 0xff +; GCN-NOHSA-VI-NEXT: s_lshl_b32 s12, s12, 8 +; GCN-NOHSA-VI-NEXT: s_lshl_b32 s36, s4, 8 +; GCN-NOHSA-VI-NEXT: s_lshr_b64 s[8:9], s[8:9], 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b64 s[6:7], s[6:7], 16 +; GCN-NOHSA-VI-NEXT: s_bfe_u32 s17, s15, 0x80010 +; GCN-NOHSA-VI-NEXT: s_and_b32 s18, s15, 0xff +; GCN-NOHSA-VI-NEXT: s_lshl_b32 s15, s15, 8 +; GCN-NOHSA-VI-NEXT: s_and_b32 s19, s10, 0xff +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s21, s14, 24 +; GCN-NOHSA-VI-NEXT: s_bfe_u32 s22, s14, 0x80010 +; GCN-NOHSA-VI-NEXT: s_and_b32 s23, s14, 0xff +; GCN-NOHSA-VI-NEXT: s_lshl_b32 s14, s14, 8 +; GCN-NOHSA-VI-NEXT: s_bfe_u32 s27, s13, 0x80010 +; GCN-NOHSA-VI-NEXT: s_and_b32 s28, s13, 0xff +; GCN-NOHSA-VI-NEXT: s_lshl_b32 s13, s13, 8 +; GCN-NOHSA-VI-NEXT: s_and_b32 s35, s4, 0xff +; GCN-NOHSA-VI-NEXT: s_lshl_b32 s16, s16, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s20, s20, 0xff0000 +; GCN-NOHSA-VI-NEXT: s_lshr_b64 s[10:11], s[10:11], 16 +; GCN-NOHSA-VI-NEXT: s_lshl_b32 s9, s26, 16 +; GCN-NOHSA-VI-NEXT: s_lshl_b32 s7, s31, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s12, s12, 0xff0000 +; GCN-NOHSA-VI-NEXT: s_and_b32 s26, s36, 0xff0000 +; GCN-NOHSA-VI-NEXT: s_lshr_b64 s[4:5], s[4:5], 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s15, s15, 0xff0000 +; GCN-NOHSA-VI-NEXT: s_lshl_b32 s11, s21, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s14, s14, 0xff0000 +; GCN-NOHSA-VI-NEXT: s_and_b32 s21, s25, 0xff0000 +; GCN-NOHSA-VI-NEXT: s_and_b32 s13, s13, 0xff0000 +; GCN-NOHSA-VI-NEXT: s_and_b32 s25, s30, 0xff0000 +; GCN-NOHSA-VI-NEXT: s_or_b32 s5, s17, s16 +; GCN-NOHSA-VI-NEXT: s_or_b32 s16, s19, s20 +; GCN-NOHSA-VI-NEXT: s_or_b32 s7, s33, s7 +; GCN-NOHSA-VI-NEXT: s_or_b32 s12, s34, s12 +; GCN-NOHSA-VI-NEXT: s_or_b32 s19, s35, s26 +; GCN-NOHSA-VI-NEXT: s_and_b32 s4, s4, 0xff00ff +; GCN-NOHSA-VI-NEXT: s_or_b32 s15, s18, s15 +; GCN-NOHSA-VI-NEXT: s_and_b32 s10, s10, 0xff00ff +; GCN-NOHSA-VI-NEXT: s_or_b32 s11, s22, s11 +; GCN-NOHSA-VI-NEXT: s_or_b32 s14, s23, s14 +; GCN-NOHSA-VI-NEXT: s_or_b32 s17, s24, s21 +; GCN-NOHSA-VI-NEXT: s_and_b32 s8, s8, 0xff00ff +; GCN-NOHSA-VI-NEXT: s_or_b32 s9, s27, s9 +; GCN-NOHSA-VI-NEXT: s_or_b32 s13, s28, s13 +; GCN-NOHSA-VI-NEXT: s_or_b32 s18, s29, s25 +; GCN-NOHSA-VI-NEXT: s_and_b32 s6, s6, 0xff00ff +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s19 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s4 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s12 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s7 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s18 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s6 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v6, s13 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v7, s9 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v8, s17 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v9, s8 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v10, s14 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:32 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v11, s11 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, 
s16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s10 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s15 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s5 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:16 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GCN-NOHSA-VI-NEXT: s_endpgm ; ; EG-LABEL: global_zextload_v32i8_to_v32i16: diff --git a/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll b/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll index 52ef811875f88..a6c019bf374d7 100644 --- a/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll +++ b/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll @@ -18,7 +18,7 @@ define amdgpu_kernel void @scalar_to_vector_v2i32(ptr addrspace(1) %out, ptr add ; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 ; SI-NEXT: s_mov_b32 s4, s0 ; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: v_mov_b32_e32 v1, v0 @@ -40,7 +40,7 @@ define amdgpu_kernel void @scalar_to_vector_v2i32(ptr addrspace(1) %out, ptr add ; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] ; VI-NEXT: v_mov_b32_e32 v1, v0 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; VI-NEXT: s_endpgm @@ -86,7 +86,7 @@ define amdgpu_kernel void @scalar_to_vector_v2f32(ptr addrspace(1) %out, ptr add ; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 ; SI-NEXT: s_mov_b32 s4, s0 ; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: v_mov_b32_e32 v1, v0 @@ -108,7 +108,7 @@ define amdgpu_kernel void @scalar_to_vector_v2f32(ptr addrspace(1) %out, ptr add ; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] ; VI-NEXT: v_mov_b32_e32 v1, v0 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; VI-NEXT: s_endpgm From 62d3a1ec4ed63c097dba632a1792afcfb4a82b9f Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Tue, 11 Nov 2025 11:21:42 -0800 Subject: [PATCH 28/64] [ARM][BPF][Lanai][MSP430] Use MCRegister::id() to avoid an implicit cast. NFC (#167537) --- llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp | 2 +- llvm/lib/Target/BPF/AsmParser/BPFAsmParser.cpp | 2 +- llvm/lib/Target/Lanai/AsmParser/LanaiAsmParser.cpp | 8 ++++---- llvm/lib/Target/MSP430/AsmParser/MSP430AsmParser.cpp | 6 +++--- 4 files changed, 9 insertions(+), 9 deletions(-) diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp b/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp index 01fe13b343926..f8196e460ae9c 100644 --- a/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp +++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp @@ -1238,7 +1238,7 @@ uint64_t ARMAsmBackendDarwin::generateCompactUnwindEncoding( // Verify standard frame (lr/r7) was used. 
   if (CFARegister != ARM::R7) {
     DEBUG_WITH_TYPE("compact-unwind", llvm::dbgs() << "frame register is "
-                                                   << CFARegister
+                                                   << CFARegister.id()
                                                    << " instead of r7\n");
     return CU::UNWIND_ARM_MODE_DWARF;
   }
diff --git a/llvm/lib/Target/BPF/AsmParser/BPFAsmParser.cpp b/llvm/lib/Target/BPF/AsmParser/BPFAsmParser.cpp
index d96f403d2f814..9f86322a81b3e 100644
--- a/llvm/lib/Target/BPF/AsmParser/BPFAsmParser.cpp
+++ b/llvm/lib/Target/BPF/AsmParser/BPFAsmParser.cpp
@@ -172,7 +172,7 @@ struct BPFOperand : public MCParsedAsmOperand {
     break;
   case Register:
     OS << "<register x";
-    OS << getReg() << ">";
+    OS << getReg().id() << ">";
     break;
   case Token:
     OS << "'" << getToken() << "'";
diff --git a/llvm/lib/Target/Lanai/AsmParser/LanaiAsmParser.cpp b/llvm/lib/Target/Lanai/AsmParser/LanaiAsmParser.cpp
index cef77f1c512f6..0444c865f6866 100644
--- a/llvm/lib/Target/Lanai/AsmParser/LanaiAsmParser.cpp
+++ b/llvm/lib/Target/Lanai/AsmParser/LanaiAsmParser.cpp
@@ -559,7 +559,7 @@ struct LanaiOperand : public MCParsedAsmOperand {
     OS << "Token: " << getToken() << "\n";
     break;
   case REGISTER:
-    OS << "Reg: %r" << getReg() << "\n";
+    OS << "Reg: %r" << getReg().id() << "\n";
     break;
   case MEMORY_IMM:
     OS << "MemImm: ";
@@ -567,14 +567,14 @@ struct LanaiOperand : public MCParsedAsmOperand {
     OS << '\n';
     break;
   case MEMORY_REG_IMM:
-    OS << "MemRegImm: " << getMemBaseReg() << "+";
+    OS << "MemRegImm: " << getMemBaseReg().id() << "+";
     MAI.printExpr(OS, *getMemOffset());
     OS << '\n';
     break;
   case MEMORY_REG_REG:
     assert(getMemOffset() == nullptr);
-    OS << "MemRegReg: " << getMemBaseReg() << "+"
-       << "%r" << getMemOffsetReg() << "\n";
+    OS << "MemRegReg: " << getMemBaseReg().id() << "+"
+       << "%r" << getMemOffsetReg().id() << "\n";
     break;
   }
 }
diff --git a/llvm/lib/Target/MSP430/AsmParser/MSP430AsmParser.cpp b/llvm/lib/Target/MSP430/AsmParser/MSP430AsmParser.cpp
index a31c8ec1b2bb5..a8891d686abe8 100644
--- a/llvm/lib/Target/MSP430/AsmParser/MSP430AsmParser.cpp
+++ b/llvm/lib/Target/MSP430/AsmParser/MSP430AsmParser.cpp
@@ -230,7 +230,7 @@ class MSP430Operand : public MCParsedAsmOperand {
     O << "Token " << Tok;
     break;
   case k_Reg:
-    O << "Register " << Reg;
+    O << "Register " << Reg.id();
     break;
   case k_Imm:
     O << "Immediate ";
@@ -241,10 +241,10 @@ class MSP430Operand : public MCParsedAsmOperand {
     MAI.printExpr(O, *Mem.Offset);
     break;
   case k_IndReg:
-    O << "RegInd " << Reg;
+    O << "RegInd " << Reg.id();
     break;
   case k_PostIndReg:
-    O << "PostInc " << Reg;
+    O << "PostInc " << Reg.id();
     break;
   }
 }

From 7f061897703e59f64b1984d27da4a6efd37f4f19 Mon Sep 17 00:00:00 2001
From: Craig Topper
Date: Tue, 11 Nov 2025 11:22:00 -0800
Subject: [PATCH 29/64] [AArch64] Use MCRegister instead of unsigned. NFC
 (#167547)
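Both of these register-class cleanups apply the same mechanical pattern. As a hedged standalone sketch (not code from either patch): MCRegister still converts to a raw register number in some contexts, and these call sites now spell that conversion out with id():

    #include "llvm/MC/MCRegister.h"
    #include "llvm/Support/raw_ostream.h"
    using namespace llvm;

    void printOperandReg(raw_ostream &OS, MCRegister Reg) {
      // Before: OS << "Register " << Reg;   // leaned on the implicit
      //                                     // MCRegister -> unsigned cast
      OS << "Register " << Reg.id();         // explicit raw register number
    }

Storage follows suit: fields and containers hold MCRegister directly, and a raw unsigned is only produced at the point of use via id().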
---
 .../AArch64/AsmParser/AArch64AsmParser.cpp    | 161 +++++++++---------
 1 file changed, 79 insertions(+), 82 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
index 6273cfc1005d6..f5dfbdc596510 100644
--- a/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
+++ b/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
@@ -88,7 +88,7 @@ class AArch64AsmParser : public MCTargetAsmParser {
   StringRef Mnemonic; ///< Instruction mnemonic.
 
   // Map of register aliases registers via the .req directive.
-  StringMap<std::pair<RegKind, unsigned>> RegisterReqs;
+  StringMap<std::pair<RegKind, MCRegister>> RegisterReqs;
 
   class PrefixInfo {
   public:
@@ -165,7 +165,7 @@ class AArch64AsmParser : public MCTargetAsmParser {
   AArch64CC::CondCode parseCondCodeString(StringRef Cond,
                                           std::string &Suggestion);
   bool parseCondCode(OperandVector &Operands, bool invertCondCode);
-  unsigned matchRegisterNameAlias(StringRef Name, RegKind Kind);
+  MCRegister matchRegisterNameAlias(StringRef Name, RegKind Kind);
   bool parseRegister(OperandVector &Operands);
   bool parseSymbolicImmVal(const MCExpr *&ImmVal);
   bool parseNeonVectorList(OperandVector &Operands);
@@ -391,7 +391,7 @@ class AArch64Operand : public MCParsedAsmOperand {
   };
 
   struct RegOp {
-    unsigned RegNum;
+    MCRegister Reg;
     RegKind Kind;
     int ElementWidth;
@@ -417,7 +417,7 @@ class AArch64Operand : public MCParsedAsmOperand {
   };
 
   struct MatrixRegOp {
-    unsigned RegNum;
+    MCRegister Reg;
     unsigned ElementWidth;
     MatrixKind Kind;
   };
@@ -427,7 +427,7 @@ class AArch64Operand : public MCParsedAsmOperand {
   };
 
   struct VectorListOp {
-    unsigned RegNum;
+    MCRegister Reg;
     unsigned Count;
     unsigned Stride;
     unsigned NumElements;
@@ -688,12 +688,12 @@ class AArch64Operand : public MCParsedAsmOperand {
 
   MCRegister getReg() const override {
     assert(Kind == k_Register && "Invalid access!");
-    return Reg.RegNum;
+    return Reg.Reg;
   }
 
-  unsigned getMatrixReg() const {
+  MCRegister getMatrixReg() const {
     assert(Kind == k_MatrixRegister && "Invalid access!");
-    return MatrixReg.RegNum;
+    return MatrixReg.Reg;
   }
 
   unsigned getMatrixElementWidth() const {
@@ -716,9 +716,9 @@ class AArch64Operand : public MCParsedAsmOperand {
     return Reg.EqualityTy;
   }
 
-  unsigned getVectorListStart() const {
+  MCRegister getVectorListStart() const {
     assert(Kind == k_VectorList && "Invalid access!");
-    return VectorList.RegNum;
+    return VectorList.Reg;
   }
 
   unsigned getVectorListCount() const {
@@ -1264,15 +1264,15 @@ class AArch64Operand : public MCParsedAsmOperand {
   bool isNeonVectorRegLo() const {
     return Kind == k_Register && Reg.Kind == RegKind::NeonVector &&
            (AArch64MCRegisterClasses[AArch64::FPR128_loRegClassID].contains(
-                Reg.RegNum) ||
+                Reg.Reg) ||
             AArch64MCRegisterClasses[AArch64::FPR64_loRegClassID].contains(
-                Reg.RegNum));
+                Reg.Reg));
   }
 
   bool isNeonVectorReg0to7() const {
     return Kind == k_Register && Reg.Kind == RegKind::NeonVector &&
            (AArch64MCRegisterClasses[AArch64::FPR128_0to7RegClassID].contains(
-               Reg.RegNum));
+               Reg.Reg));
   }
 
   bool isMatrix() const { return Kind == k_MatrixRegister; }
@@ -1401,34 +1401,34 @@ class AArch64Operand : public MCParsedAsmOperand {
 
   bool isGPR32as64() const {
     return Kind == k_Register && Reg.Kind == RegKind::Scalar &&
-      AArch64MCRegisterClasses[AArch64::GPR64RegClassID].contains(Reg.RegNum);
+      AArch64MCRegisterClasses[AArch64::GPR64RegClassID].contains(Reg.Reg);
   }
 
   bool isGPR64as32() const {
     return Kind == k_Register && Reg.Kind == RegKind::Scalar &&
-      AArch64MCRegisterClasses[AArch64::GPR32RegClassID].contains(Reg.RegNum);
+      AArch64MCRegisterClasses[AArch64::GPR32RegClassID].contains(Reg.Reg);
   }
 
   bool isGPR64x8() const {
     return Kind == k_Register && Reg.Kind == RegKind::Scalar &&
           AArch64MCRegisterClasses[AArch64::GPR64x8ClassRegClassID].contains(
-               Reg.RegNum);
+               Reg.Reg);
   }
 
   bool isWSeqPair() const {
     return Kind == k_Register && Reg.Kind == RegKind::Scalar &&
           AArch64MCRegisterClasses[AArch64::WSeqPairsClassRegClassID].contains(
-               Reg.RegNum);
+               Reg.Reg);
   }
 
   bool isXSeqPair() const {
     return Kind == k_Register && Reg.Kind == RegKind::Scalar &&
           AArch64MCRegisterClasses[AArch64::XSeqPairsClassRegClassID].contains(
-               Reg.RegNum);
+
Reg.Reg); } bool isSyspXzrPair() const { - return isGPR64() && Reg.RegNum == AArch64::XZR; + return isGPR64() && Reg.Reg == AArch64::XZR; } template @@ -1495,7 +1495,7 @@ class AArch64Operand : public MCParsedAsmOperand { isTypedVectorList(); if (!Res) return DiagnosticPredicate::NoMatch; - if (!AArch64MCRegisterClasses[RegClass].contains(VectorList.RegNum)) + if (!AArch64MCRegisterClasses[RegClass].contains(VectorList.Reg)) return DiagnosticPredicate::NearMatch; return DiagnosticPredicate::Match; } @@ -1507,9 +1507,9 @@ class AArch64Operand : public MCParsedAsmOperand { ElementWidth, Stride>(); if (!Res) return DiagnosticPredicate::NoMatch; - if ((VectorList.RegNum < (AArch64::Z0 + Stride)) || - ((VectorList.RegNum >= AArch64::Z16) && - (VectorList.RegNum < (AArch64::Z16 + Stride)))) + if ((VectorList.Reg < (AArch64::Z0 + Stride)) || + ((VectorList.Reg >= AArch64::Z16) && + (VectorList.Reg < (AArch64::Z16 + Stride)))) return DiagnosticPredicate::Match; return DiagnosticPredicate::NoMatch; } @@ -1841,7 +1841,7 @@ class AArch64Operand : public MCParsedAsmOperand { void addPPRorPNRRegOperands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); - unsigned Reg = getReg(); + MCRegister Reg = getReg(); // Normalise to PPR if (Reg >= AArch64::PN0 && Reg <= AArch64::PN15) Reg = Reg - AArch64::PN0 + AArch64::P0; @@ -2336,13 +2336,12 @@ class AArch64Operand : public MCParsedAsmOperand { } static std::unique_ptr - CreateReg(unsigned RegNum, RegKind Kind, SMLoc S, SMLoc E, MCContext &Ctx, + CreateReg(MCRegister Reg, RegKind Kind, SMLoc S, SMLoc E, MCContext &Ctx, RegConstraintEqualityTy EqTy = RegConstraintEqualityTy::EqualsReg, AArch64_AM::ShiftExtendType ExtTy = AArch64_AM::LSL, - unsigned ShiftAmount = 0, - unsigned HasExplicitAmount = false) { + unsigned ShiftAmount = 0, unsigned HasExplicitAmount = false) { auto Op = std::make_unique(k_Register, Ctx); - Op->Reg.RegNum = RegNum; + Op->Reg.Reg = Reg; Op->Reg.Kind = Kind; Op->Reg.ElementWidth = 0; Op->Reg.EqualityTy = EqTy; @@ -2354,28 +2353,26 @@ class AArch64Operand : public MCParsedAsmOperand { return Op; } - static std::unique_ptr - CreateVectorReg(unsigned RegNum, RegKind Kind, unsigned ElementWidth, - SMLoc S, SMLoc E, MCContext &Ctx, - AArch64_AM::ShiftExtendType ExtTy = AArch64_AM::LSL, - unsigned ShiftAmount = 0, - unsigned HasExplicitAmount = false) { + static std::unique_ptr CreateVectorReg( + MCRegister Reg, RegKind Kind, unsigned ElementWidth, SMLoc S, SMLoc E, + MCContext &Ctx, AArch64_AM::ShiftExtendType ExtTy = AArch64_AM::LSL, + unsigned ShiftAmount = 0, unsigned HasExplicitAmount = false) { assert((Kind == RegKind::NeonVector || Kind == RegKind::SVEDataVector || Kind == RegKind::SVEPredicateVector || Kind == RegKind::SVEPredicateAsCounter) && "Invalid vector kind"); - auto Op = CreateReg(RegNum, Kind, S, E, Ctx, EqualsReg, ExtTy, ShiftAmount, + auto Op = CreateReg(Reg, Kind, S, E, Ctx, EqualsReg, ExtTy, ShiftAmount, HasExplicitAmount); Op->Reg.ElementWidth = ElementWidth; return Op; } static std::unique_ptr - CreateVectorList(unsigned RegNum, unsigned Count, unsigned Stride, + CreateVectorList(MCRegister Reg, unsigned Count, unsigned Stride, unsigned NumElements, unsigned ElementWidth, RegKind RegisterKind, SMLoc S, SMLoc E, MCContext &Ctx) { auto Op = std::make_unique(k_VectorList, Ctx); - Op->VectorList.RegNum = RegNum; + Op->VectorList.Reg = Reg; Op->VectorList.Count = Count; Op->VectorList.Stride = Stride; Op->VectorList.NumElements = NumElements; @@ -2586,10 +2583,10 @@ class AArch64Operand : 
public MCParsedAsmOperand { } static std::unique_ptr - CreateMatrixRegister(unsigned RegNum, unsigned ElementWidth, MatrixKind Kind, + CreateMatrixRegister(MCRegister Reg, unsigned ElementWidth, MatrixKind Kind, SMLoc S, SMLoc E, MCContext &Ctx) { auto Op = std::make_unique(k_MatrixRegister, Ctx); - Op->MatrixReg.RegNum = RegNum; + Op->MatrixReg.Reg = Reg; Op->MatrixReg.ElementWidth = ElementWidth; Op->MatrixReg.Kind = Kind; Op->StartLoc = S; @@ -2660,9 +2657,9 @@ void AArch64Operand::print(raw_ostream &OS, const MCAsmInfo &MAI) const { break; case k_VectorList: { OS << ""; break; } @@ -2699,7 +2696,7 @@ void AArch64Operand::print(raw_ostream &OS, const MCAsmInfo &MAI) const { OS << getCMHPriorityHintName(); break; case k_MatrixRegister: - OS << ""; + OS << ""; break; case k_MatrixTileList: { OS << ""; + OS << ""; if (!getShiftExtendAmount() && !hasShiftExtendAmount()) break; [[fallthrough]]; @@ -3048,53 +3045,53 @@ ParseStatus AArch64AsmParser::tryParseRegister(MCRegister &Reg, SMLoc &StartLoc, } // Matches a register name or register alias previously defined by '.req' -unsigned AArch64AsmParser::matchRegisterNameAlias(StringRef Name, - RegKind Kind) { - unsigned RegNum = 0; - if ((RegNum = matchSVEDataVectorRegName(Name))) - return Kind == RegKind::SVEDataVector ? RegNum : 0; +MCRegister AArch64AsmParser::matchRegisterNameAlias(StringRef Name, + RegKind Kind) { + MCRegister Reg = MCRegister(); + if ((Reg = matchSVEDataVectorRegName(Name))) + return Kind == RegKind::SVEDataVector ? Reg : MCRegister(); - if ((RegNum = matchSVEPredicateVectorRegName(Name))) - return Kind == RegKind::SVEPredicateVector ? RegNum : 0; + if ((Reg = matchSVEPredicateVectorRegName(Name))) + return Kind == RegKind::SVEPredicateVector ? Reg : MCRegister(); - if ((RegNum = matchSVEPredicateAsCounterRegName(Name))) - return Kind == RegKind::SVEPredicateAsCounter ? RegNum : 0; + if ((Reg = matchSVEPredicateAsCounterRegName(Name))) + return Kind == RegKind::SVEPredicateAsCounter ? Reg : MCRegister(); - if ((RegNum = MatchNeonVectorRegName(Name))) - return Kind == RegKind::NeonVector ? RegNum : 0; + if ((Reg = MatchNeonVectorRegName(Name))) + return Kind == RegKind::NeonVector ? Reg : MCRegister(); - if ((RegNum = matchMatrixRegName(Name))) - return Kind == RegKind::Matrix ? RegNum : 0; + if ((Reg = matchMatrixRegName(Name))) + return Kind == RegKind::Matrix ? Reg : MCRegister(); - if (Name.equals_insensitive("zt0")) + if (Name.equals_insensitive("zt0")) return Kind == RegKind::LookupTable ? unsigned(AArch64::ZT0) : 0; // The parsed register must be of RegKind Scalar - if ((RegNum = MatchRegisterName(Name))) - return (Kind == RegKind::Scalar) ? RegNum : 0; + if ((Reg = MatchRegisterName(Name))) + return (Kind == RegKind::Scalar) ? Reg : MCRegister(); - if (!RegNum) { + if (!Reg) { // Handle a few common aliases of registers. - if (auto RegNum = StringSwitch(Name.lower()) - .Case("fp", AArch64::FP) - .Case("lr", AArch64::LR) - .Case("x31", AArch64::XZR) - .Case("w31", AArch64::WZR) - .Default(0)) - return Kind == RegKind::Scalar ? RegNum : 0; + if (MCRegister Reg = StringSwitch(Name.lower()) + .Case("fp", AArch64::FP) + .Case("lr", AArch64::LR) + .Case("x31", AArch64::XZR) + .Case("w31", AArch64::WZR) + .Default(0)) + return Kind == RegKind::Scalar ? Reg : MCRegister(); // Check for aliases registered via .req. Canonicalize to lower case. // That's more consistent since register names are case insensitive, and // it's how the original entry was passed in from MC/MCParser/AsmParser. 
    auto Entry = RegisterReqs.find(Name.lower());
    if (Entry == RegisterReqs.end())
-      return 0;
+      return MCRegister();
 
-    // set RegNum if the match is the right kind of register
+    // set Reg if the match is the right kind of register
    if (Kind == Entry->getValue().first)
-      RegNum = Entry->getValue().second;
+      Reg = Entry->getValue().second;
  }
-  return RegNum;
+  return Reg;
 }
 
 unsigned AArch64AsmParser::getNumRegsForRegKind(RegKind K) {
@@ -3122,8 +3119,8 @@ ParseStatus AArch64AsmParser::tryParseScalarRegister(MCRegister &RegNum) {
     return ParseStatus::NoMatch;
 
   std::string lowerCase = Tok.getString().lower();
-  unsigned Reg = matchRegisterNameAlias(lowerCase, RegKind::Scalar);
-  if (Reg == 0)
+  MCRegister Reg = matchRegisterNameAlias(lowerCase, RegKind::Scalar);
+  if (!Reg)
     return ParseStatus::NoMatch;
 
   RegNum = Reg;
@@ -3667,7 +3664,7 @@ ParseStatus AArch64AsmParser::tryParseMatrixRegister(OperandVector &Operands) {
   }
 
   // Try to parse matrix register.
-  unsigned Reg = matchRegisterNameAlias(Name, RegKind::Matrix);
+  MCRegister Reg = matchRegisterNameAlias(Name, RegKind::Matrix);
   if (!Reg)
     return ParseStatus::NoMatch;
 
@@ -4130,12 +4127,12 @@ bool AArch64AsmParser::parseSyslAlias(StringRef Name, SMLoc NameLoc,
   SMLoc startLoc = getLoc();
   const AsmToken &regTok = getTok();
   StringRef reg = regTok.getString();
-  unsigned RegNum = matchRegisterNameAlias(reg.lower(), RegKind::Scalar);
-  if (!RegNum)
+  MCRegister Reg = matchRegisterNameAlias(reg.lower(), RegKind::Scalar);
+  if (!Reg)
     return TokError("expected register operand");
 
   Operands.push_back(AArch64Operand::CreateReg(
-      RegNum, RegKind::Scalar, startLoc, getLoc(), getContext(), EqualsReg));
+      Reg, RegKind::Scalar, startLoc, getLoc(), getContext(), EqualsReg));
 
   Lex(); // Eat token
 
   if (parseToken(AsmToken::Comma))
@@ -4453,7 +4450,7 @@ ParseStatus AArch64AsmParser::tryParseVectorRegister(MCRegister &Reg,
   // a '.'.
   size_t Start = 0, Next = Name.find('.');
   StringRef Head = Name.slice(Start, Next);
-  unsigned RegNum = matchRegisterNameAlias(Head, MatchKind);
+  MCRegister RegNum = matchRegisterNameAlias(Head, MatchKind);
 
   if (RegNum) {
     if (Next != StringRef::npos) {
@@ -4937,13 +4934,13 @@ ParseStatus AArch64AsmParser::tryParseZTOperand(OperandVector &Operands) {
   const AsmToken &Tok = getTok();
   std::string Name = Tok.getString().lower();
 
-  unsigned RegNum = matchRegisterNameAlias(Name, RegKind::LookupTable);
+  MCRegister Reg = matchRegisterNameAlias(Name, RegKind::LookupTable);
 
-  if (RegNum == 0)
+  if (!Reg)
    return ParseStatus::NoMatch;
 
   Operands.push_back(AArch64Operand::CreateReg(
-      RegNum, RegKind::LookupTable, StartLoc, getLoc(), getContext()));
+      Reg, RegKind::LookupTable, StartLoc, getLoc(), getContext()));
   Lex(); // Eat register.
 
   // Check if register is followed by an index
@@ -7651,7 +7648,7 @@ bool AArch64AsmParser::parseDirectiveReq(StringRef Name, SMLoc L) {
   if (parseEOL())
     return true;
 
-  auto pair = std::make_pair(RegisterKind, (unsigned) RegNum);
+  auto pair = std::make_pair(RegisterKind, RegNum);
   if (RegisterReqs.insert(std::make_pair(Name, pair)).first->second != pair)
     Warning(L, "ignoring redefinition of register alias '" + Name + "'");

From 298c25aa20144345e250758fd54529758ac3737a Mon Sep 17 00:00:00 2001
From: Paul Kirth
Date: Tue, 11 Nov 2025 11:26:43 -0800
Subject: [PATCH 30/64] [llvm][asmprinter] Make call graph section
 deterministic (#167400)

The call-graph-section-assembly.ll tests in CodeGen/X86 and CodeGen/Aarch64
fail on bots under LLVM_REVERSE_ITERATION. These sets should use SetVector
to avoid non-determinism in the output.
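A minimal sketch of the distinction (illustrative only, with hypothetical values): SmallSet iterates in an unspecified order, which LLVM_REVERSE_ITERATION deliberately flips, while SmallSetVector additionally records insertion order, so traversal, and therefore the emitted section, is stable:

    #include "llvm/ADT/SetVector.h"
    #include "llvm/ADT/SmallSet.h"
    #include "llvm/Support/raw_ostream.h"
    #include <cstdint>
    using namespace llvm;

    void demo() {
      SmallSet<uint64_t, 4> Unordered;        // iteration order unspecified
      SmallSetVector<uint64_t, 4> Ordered;    // iterates in insertion order
      for (uint64_t Id : {42, 7, 19}) {
        Unordered.insert(Id);
        Ordered.insert(Id);
      }
      for (uint64_t Id : Ordered)             // always 42, 7, 19
        outs() << Id << '\n';
    }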
--- llvm/include/llvm/CodeGen/AsmPrinter.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/llvm/include/llvm/CodeGen/AsmPrinter.h b/llvm/include/llvm/CodeGen/AsmPrinter.h index 9ace2555b4b62..311f7df98cf8c 100644 --- a/llvm/include/llvm/CodeGen/AsmPrinter.h +++ b/llvm/include/llvm/CodeGen/AsmPrinter.h @@ -18,7 +18,7 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/IntrusiveRefCntPtr.h" #include "llvm/ADT/MapVector.h" -#include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallVector.h" #include "llvm/Analysis/ProfileSummaryInfo.h" #include "llvm/Analysis/StaticDataProfileInfo.h" @@ -207,9 +207,9 @@ class LLVM_ABI AsmPrinter : public MachineFunctionPass { using CGTypeId = uint64_t; /// Unique target type IDs. - SmallSet IndirectCalleeTypeIDs; + SmallSetVector IndirectCalleeTypeIDs; /// Unique direct callees. - SmallSet DirectCallees; + SmallSetVector DirectCallees; }; enum CallGraphSectionFormatVersion : uint8_t { From 9f069d9d5fbb2644b0d2e62a4a8835c11498dedc Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Tue, 11 Nov 2025 11:27:18 -0800 Subject: [PATCH 31/64] [AVR] Remove implicit conversions of MCRegister to unsigned. NFC (#167566) Use MCRegister instead of MCPhysReg or use MCRegister::id(). --- llvm/lib/Target/AVR/AsmParser/AVRAsmParser.cpp | 4 ++-- llvm/lib/Target/AVR/Disassembler/AVRDisassembler.cpp | 4 ++-- llvm/lib/Target/AVR/MCTargetDesc/AVRMCCodeEmitter.cpp | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/llvm/lib/Target/AVR/AsmParser/AVRAsmParser.cpp b/llvm/lib/Target/AVR/AsmParser/AVRAsmParser.cpp index fc794c4968b8c..48452f6d9391c 100644 --- a/llvm/lib/Target/AVR/AsmParser/AVRAsmParser.cpp +++ b/llvm/lib/Target/AVR/AsmParser/AVRAsmParser.cpp @@ -252,7 +252,7 @@ class AVROperand : public MCParsedAsmOperand { O << "Token: \"" << getToken() << "\""; break; case k_Register: - O << "Register: " << getReg(); + O << "Register: " << getReg().id(); break; case k_Immediate: O << "Immediate: \""; @@ -262,7 +262,7 @@ class AVROperand : public MCParsedAsmOperand { case k_Memri: { // only manually print the size for non-negative values, // as the sign is inserted automatically. - O << "Memri: \"" << getReg() << '+'; + O << "Memri: \"" << getReg().id() << '+'; MAI.printExpr(O, *getImm()); O << "\""; break; diff --git a/llvm/lib/Target/AVR/Disassembler/AVRDisassembler.cpp b/llvm/lib/Target/AVR/Disassembler/AVRDisassembler.cpp index 5548ad1ebff5e..84a64ba0aa4ff 100644 --- a/llvm/lib/Target/AVR/Disassembler/AVRDisassembler.cpp +++ b/llvm/lib/Target/AVR/Disassembler/AVRDisassembler.cpp @@ -82,7 +82,7 @@ static DecodeStatus DecodeGPR8RegisterClass(MCInst &Inst, unsigned RegNo, if (RegNo > 31) return MCDisassembler::Fail; - unsigned Register = GPRDecoderTable[RegNo]; + MCRegister Register = GPRDecoderTable[RegNo]; Inst.addOperand(MCOperand::createReg(Register)); return MCDisassembler::Success; } @@ -174,7 +174,7 @@ static DecodeStatus decodeLoadStore(MCInst &Inst, unsigned Insn, uint64_t Address, const MCDisassembler *Decoder) { // Get the register will be loaded or stored. - unsigned RegVal = GPRDecoderTable[(Insn >> 4) & 0x1f]; + MCRegister RegVal = GPRDecoderTable[(Insn >> 4) & 0x1f]; // Decode LDD/STD with offset less than 8. 
if ((Insn & 0xf000) == 0x8000) { diff --git a/llvm/lib/Target/AVR/MCTargetDesc/AVRMCCodeEmitter.cpp b/llvm/lib/Target/AVR/MCTargetDesc/AVRMCCodeEmitter.cpp index 4bb16e237db48..fbb130ccde681 100644 --- a/llvm/lib/Target/AVR/MCTargetDesc/AVRMCCodeEmitter.cpp +++ b/llvm/lib/Target/AVR/MCTargetDesc/AVRMCCodeEmitter.cpp @@ -96,7 +96,7 @@ AVRMCCodeEmitter::loadStorePostEncoder(const MCInst &MI, unsigned EncodedValue, EncodedValue |= (1 << 12); // Encode the pointer register. - switch (MI.getOperand(Idx).getReg()) { + switch (MI.getOperand(Idx).getReg().id()) { case AVR::R27R26: EncodedValue |= 0xc; break; From f5e2c5ddcec758ffdaff027a239a163331a73292 Mon Sep 17 00:00:00 2001 From: Paul Kirth Date: Tue, 11 Nov 2025 11:29:52 -0800 Subject: [PATCH 32/64] [clang][test] Fix test issue under LLVM_REVERSE_ITERATION (#167394) The order of these items may not be the same if reverse iteration is enabled. From what I can tell this is related to visitation order, and I don't see an easy way to handle that using the typical solutions, like MapVector, etc. For now, just use CHECK-DAG to get the test into a passing state. Fixes #167057 --- clang/test/Analysis/analyzeOneFunction.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/clang/test/Analysis/analyzeOneFunction.cpp b/clang/test/Analysis/analyzeOneFunction.cpp index 3a362dfd9a08c..b2257570f6052 100644 --- a/clang/test/Analysis/analyzeOneFunction.cpp +++ b/clang/test/Analysis/analyzeOneFunction.cpp @@ -5,9 +5,9 @@ // RUN: -analyze-function="c:@S@Window@F@overloaded#I#" // RUN: %clang_extdef_map %s | FileCheck %s -// CHECK: 27:c:@S@Window@F@overloaded#I# -// CHECK-NEXT: 27:c:@S@Window@F@overloaded#C# -// CHECK-NEXT: 27:c:@S@Window@F@overloaded#d# +// CHECK-DAG: 27:c:@S@Window@F@overloaded#I# +// CHECK-DAG: 27:c:@S@Window@F@overloaded#C# +// CHECK-DAG: 27:c:@S@Window@F@overloaded#d# void clang_analyzer_warnIfReached(); From 5e30fe830d8e3827d06ee258f7f8f28ed17bc0e7 Mon Sep 17 00:00:00 2001 From: LLVM GN Syncbot Date: Tue, 11 Nov 2025 19:35:49 +0000 Subject: [PATCH 33/64] [gn build] Port 17ce48f2b687 --- llvm/utils/gn/secondary/libcxx/include/BUILD.gn | 1 - 1 file changed, 1 deletion(-) diff --git a/llvm/utils/gn/secondary/libcxx/include/BUILD.gn b/llvm/utils/gn/secondary/libcxx/include/BUILD.gn index e377b380685fd..b36466b3fac8b 100644 --- a/llvm/utils/gn/secondary/libcxx/include/BUILD.gn +++ b/llvm/utils/gn/secondary/libcxx/include/BUILD.gn @@ -1505,7 +1505,6 @@ if (current_toolchain == default_toolchain) { "__type_traits/is_reference.h", "__type_traits/is_reference_wrapper.h", "__type_traits/is_referenceable.h", - "__type_traits/is_replaceable.h", "__type_traits/is_same.h", "__type_traits/is_scalar.h", "__type_traits/is_signed.h", From 9fce00469d03d7bf9acbbcd6597206cbbbbbe238 Mon Sep 17 00:00:00 2001 From: LLVM GN Syncbot Date: Tue, 11 Nov 2025 19:35:50 +0000 Subject: [PATCH 34/64] [gn build] Port 82180558fea9 --- .../gn/secondary/llvm/lib/ExecutionEngine/JITLink/BUILD.gn | 2 ++ 1 file changed, 2 insertions(+) diff --git a/llvm/utils/gn/secondary/llvm/lib/ExecutionEngine/JITLink/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/ExecutionEngine/JITLink/BUILD.gn index d8dfcbd3bac2d..854aa296ba161 100644 --- a/llvm/utils/gn/secondary/llvm/lib/ExecutionEngine/JITLink/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/lib/ExecutionEngine/JITLink/BUILD.gn @@ -32,6 +32,7 @@ static_library("JITLink") { "ELF_loongarch.cpp", "ELF_ppc64.cpp", "ELF_riscv.cpp", + "ELF_systemz.cpp", "ELF_x86.cpp", "ELF_x86_64.cpp", "JITLink.cpp", @@ -49,6 +50,7 @@ 
static_library("JITLink") { "loongarch.cpp", "ppc64.cpp", "riscv.cpp", + "systemz.cpp", "x86.cpp", "x86_64.cpp", ] From e77001c9f5b5e99d915a2f45f9f067e83def22e1 Mon Sep 17 00:00:00 2001 From: Sterling-Augustine Date: Tue, 11 Nov 2025 11:49:10 -0800 Subject: [PATCH 35/64] [libc] Use a sensible default when TEST_UNDECLARED_OUTPUTS_DIR is unset. (#167422) There is no guarantee that this environment variable is set. Eg, when running a test outside of the build system, such as under a debugger. And passing a nullptr to the string constructor is undefined. Use an empty string, which seems like it is close to the original intent. --- libc/test/UnitTest/BazelFilePath.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/libc/test/UnitTest/BazelFilePath.cpp b/libc/test/UnitTest/BazelFilePath.cpp index ee5fcaaa63d91..7f9f42b46dca9 100644 --- a/libc/test/UnitTest/BazelFilePath.cpp +++ b/libc/test/UnitTest/BazelFilePath.cpp @@ -20,6 +20,10 @@ namespace testing { CString libc_make_test_file_path_func(const char *file_name) { // This is the path to the folder bazel wants the test outputs written to. const char *UNDECLARED_OUTPUTS_PATH = getenv("TEST_UNDECLARED_OUTPUTS_DIR"); + // Do something sensible if not run under bazel, otherwise this may segfault + // when constructing the string. + if (UNDECLARED_OUTPUTS_PATH == nullptr) + UNDECLARED_OUTPUTS_PATH = ""; return cpp::string(UNDECLARED_OUTPUTS_PATH) + file_name; } From cc5057cc2153924d414aac0d4cc7f08d205cd079 Mon Sep 17 00:00:00 2001 From: Steven Wu Date: Tue, 11 Nov 2025 11:49:45 -0800 Subject: [PATCH 36/64] [CAS] Fix AIX build (#159647) Fix AIX build by linking `libbsd.a` to include implementation for `flock`. --- llvm/lib/CAS/CMakeLists.txt | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/llvm/lib/CAS/CMakeLists.txt b/llvm/lib/CAS/CMakeLists.txt index aad77dce370d8..b03895cfc77d7 100644 --- a/llvm/lib/CAS/CMakeLists.txt +++ b/llvm/lib/CAS/CMakeLists.txt @@ -1,3 +1,7 @@ +if (UNIX AND "${CMAKE_SYSTEM_NAME}" MATCHES "AIX") + set(additional_libs bsd) +endif() + add_llvm_component_library(LLVMCAS ActionCache.cpp ActionCaches.cpp @@ -20,6 +24,7 @@ add_llvm_component_library(LLVMCAS LINK_LIBS ${LLVM_PTHREAD_LIB} + ${additional_libs} LINK_COMPONENTS Support From c9ff2df8c3233dcb1d3e45039569dd1b9235ad52 Mon Sep 17 00:00:00 2001 From: Daniel Thornburgh Date: Tue, 11 Nov 2025 11:52:56 -0800 Subject: [PATCH 37/64] [IR] "modular-format" attribute for functions using format strings (#147429) A new InstCombine transform uses this attribute to rewrite calls to a modular version of the implementation along with llvm.reloc.none relocations against aspects of the implementation needed by the call. This change only adds support for the 'float' aspect, but it also builds the structure needed for others. See issue #146159 --- llvm/docs/LangRef.rst | 21 ++++ llvm/docs/ReleaseNotes.md | 4 + llvm/lib/IR/Verifier.cpp | 14 +++ .../InstCombine/InstCombineCalls.cpp | 69 ++++++++++++ .../Transforms/InstCombine/modular-format.ll | 105 ++++++++++++++++++ llvm/test/Verifier/modular-format.ll | 41 +++++++ 6 files changed, 254 insertions(+) create mode 100644 llvm/test/Transforms/InstCombine/modular-format.ll create mode 100644 llvm/test/Verifier/modular-format.ll diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst index ab085ca0b1499..2a6b6a612f541 100644 --- a/llvm/docs/LangRef.rst +++ b/llvm/docs/LangRef.rst @@ -2747,6 +2747,27 @@ For example: all arguments are not undef and not poison. Otherwise, it is undefined behavior. 
+``"modular-format"=",,,,,"`` + This attribute indicates that the implementation is modular on a particular + format string argument. If the compiler can determine that not all aspects + of the implementation are needed, it can report which aspects were needed + and redirect the call to a modular implementation function instead. + + The compiler reports that an implementation aspect is needed by issuing a + relocation for the symbol `_``. This arranges for code + and data needed to support the aspect of the implementation to be brought + into the link to satisfy weak references in the modular implemenation + function. + + The first three arguments have the same semantics as the arguments to the C + ``format`` attribute. + + The following aspects are currently supported: + + - ``float``: The call has a floating point argument + + + Call Site Attributes ---------------------- diff --git a/llvm/docs/ReleaseNotes.md b/llvm/docs/ReleaseNotes.md index fd78c97c86d24..c76717fdc990c 100644 --- a/llvm/docs/ReleaseNotes.md +++ b/llvm/docs/ReleaseNotes.md @@ -70,6 +70,10 @@ Changes to the LLVM IR * Added `@llvm.reloc.none` intrinsic to emit null relocations to symbols. This emits an undefined symbol reference without adding any dedicated code or data to to bear the relocation. +* Added `modular-format` attribute to dynamically pull in aspects of libc + format string function implementations from statically-linked libc's based on + the requirements of each call. Currently only `float` is supported; this can + keep floating point support out of printf if it can be proven unused. Changes to LLVM infrastructure ------------------------------ diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp index 45f3c1bcbf5f3..fa18c3cd0f404 100644 --- a/llvm/lib/IR/Verifier.cpp +++ b/llvm/lib/IR/Verifier.cpp @@ -2567,6 +2567,20 @@ void Verifier::verifyFunctionAttrs(FunctionType *FT, AttributeList Attrs, CheckFailed("invalid value for 'denormal-fp-math-f32' attribute: " + S, V); } + + if (auto A = Attrs.getFnAttr("modular-format"); A.isValid()) { + StringRef S = A.getValueAsString(); + SmallVector Args; + S.split(Args, ','); + Check(Args.size() >= 5, + "modular-format attribute requires at least 5 arguments", V); + unsigned FirstArgIdx; + Check(!Args[2].getAsInteger(10, FirstArgIdx), + "modular-format attribute first arg index is not an integer", V); + unsigned UpperBound = FT->getNumParams() + (FT->isVarArg() ? 
+
+
 Call Site Attributes
 ----------------------
diff --git a/llvm/docs/ReleaseNotes.md b/llvm/docs/ReleaseNotes.md
index fd78c97c86d24..c76717fdc990c 100644
--- a/llvm/docs/ReleaseNotes.md
+++ b/llvm/docs/ReleaseNotes.md
@@ -70,6 +70,10 @@ Changes to the LLVM IR
 * Added `@llvm.reloc.none` intrinsic to emit null relocations to symbols. This
   emits an undefined symbol reference without adding any dedicated code or
   data to bear the relocation.
+* Added `modular-format` attribute to dynamically pull in aspects of libc
+  format string function implementations from statically-linked libcs based on
+  the requirements of each call. Currently only `float` is supported; this can
+  keep floating point support out of printf if it can be proven unused.
 
 Changes to LLVM infrastructure
 ------------------------------
diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp
index 45f3c1bcbf5f3..fa18c3cd0f404 100644
--- a/llvm/lib/IR/Verifier.cpp
+++ b/llvm/lib/IR/Verifier.cpp
@@ -2567,6 +2567,20 @@ void Verifier::verifyFunctionAttrs(FunctionType *FT, AttributeList Attrs,
       CheckFailed("invalid value for 'denormal-fp-math-f32' attribute: " + S,
                   V);
   }
+
+  if (auto A = Attrs.getFnAttr("modular-format"); A.isValid()) {
+    StringRef S = A.getValueAsString();
+    SmallVector<StringRef> Args;
+    S.split(Args, ',');
+    Check(Args.size() >= 5,
+          "modular-format attribute requires at least 5 arguments", V);
+    unsigned FirstArgIdx;
+    Check(!Args[2].getAsInteger(10, FirstArgIdx),
+          "modular-format attribute first arg index is not an integer", V);
+    unsigned UpperBound = FT->getNumParams() + (FT->isVarArg() ? 1 : 0);
+    Check(FirstArgIdx > 0 && FirstArgIdx <= UpperBound,
+          "modular-format attribute first arg index is out of bounds", V);
+  }
 }
 
 void Verifier::verifyUnknownProfileMetadata(MDNode *MD) {
   Check(MD->getNumOperands() == 2,
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
index 92fca90ddb88a..8e4edefec42fd 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -19,6 +19,7 @@
 #include "llvm/ADT/SmallBitVector.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringExtras.h"
 #include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/Analysis/AssumeBundleQueries.h"
 #include "llvm/Analysis/AssumptionCache.h"
@@ -4091,6 +4092,70 @@ Instruction *InstCombinerImpl::visitCallBrInst(CallBrInst &CBI) {
   return visitCallBase(CBI);
 }
 
+static Value *optimizeModularFormat(CallInst *CI, IRBuilderBase &B) {
+  if (!CI->hasFnAttr("modular-format"))
+    return nullptr;
+
+  SmallVector<StringRef> Args(
+      llvm::split(CI->getFnAttr("modular-format").getValueAsString(), ','));
+  // TODO: Make use of the first two arguments
+  unsigned FirstArgIdx;
+  [[maybe_unused]] bool Error;
+  Error = Args[2].getAsInteger(10, FirstArgIdx);
+  assert(!Error && "invalid first arg index");
+  --FirstArgIdx;
+  StringRef FnName = Args[3];
+  StringRef ImplName = Args[4];
+  ArrayRef<StringRef> AllAspects = ArrayRef(Args).drop_front(5);
+
+  if (AllAspects.empty())
+    return nullptr;
+
+  SmallVector<StringRef> NeededAspects;
+  for (StringRef Aspect : AllAspects) {
+    if (Aspect == "float") {
+      if (llvm::any_of(
+              llvm::make_range(std::next(CI->arg_begin(), FirstArgIdx),
+                               CI->arg_end()),
+              [](Value *V) { return V->getType()->isFloatingPointTy(); }))
+        NeededAspects.push_back("float");
+    } else {
+      // Unknown aspects are always considered to be needed.
+      NeededAspects.push_back(Aspect);
+    }
+  }
+
+  if (NeededAspects.size() == AllAspects.size())
+    return nullptr;
+
+  Module *M = CI->getModule();
+  LLVMContext &Ctx = M->getContext();
+  Function *Callee = CI->getCalledFunction();
+  FunctionCallee ModularFn = M->getOrInsertFunction(
+      FnName, Callee->getFunctionType(),
+      Callee->getAttributes().removeFnAttribute(Ctx, "modular-format"));
+  CallInst *New = cast<CallInst>(CI->clone());
+  New->setCalledFunction(ModularFn);
+  New->removeFnAttr("modular-format");
+  B.Insert(New);
+
+  const auto ReferenceAspect = [&](StringRef Aspect) {
+    SmallString<20> Name = ImplName;
+    Name += '_';
+    Name += Aspect;
+    Function *RelocNoneFn =
+        Intrinsic::getOrInsertDeclaration(M, Intrinsic::reloc_none);
+    B.CreateCall(RelocNoneFn,
+                 {MetadataAsValue::get(Ctx, MDString::get(Ctx, Name))});
+  };
+
+  llvm::sort(NeededAspects);
+  for (StringRef Request : NeededAspects)
+    ReferenceAspect(Request);
+
+  return New;
+}
+
 Instruction *InstCombinerImpl::tryOptimizeCall(CallInst *CI) {
   if (!CI->getCalledFunction())
     return nullptr;
@@ -4112,6 +4177,10 @@ Instruction *InstCombinerImpl::tryOptimizeCall(CallInst *CI) {
     ++NumSimplified;
     return CI->use_empty() ? CI : replaceInstUsesWith(*CI, With);
   }
+  if (Value *With = optimizeModularFormat(CI, Builder)) {
+    ++NumSimplified;
+    return CI->use_empty() ?
CI : replaceInstUsesWith(*CI, With); + } return nullptr; } diff --git a/llvm/test/Transforms/InstCombine/modular-format.ll b/llvm/test/Transforms/InstCombine/modular-format.ll new file mode 100644 index 0000000000000..d9b7b6f056f59 --- /dev/null +++ b/llvm/test/Transforms/InstCombine/modular-format.ll @@ -0,0 +1,105 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; Test that the modular format string library call simplifier works correctly. +; +; RUN: opt < %s -passes=instcombine -S | FileCheck %s + +target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128" + +@.str.int = constant [3 x i8] c"%d\00" +@.str.float = constant [3 x i8] c"%f\00" +@.str.multi = constant [6 x i8] c"%f %d\00" +@.str.noargs = constant [1 x i8] c"\00" + +;; No aspects are specified, so no transformation occurs. +define void @test_basic(i32 %arg) { +; CHECK-LABEL: @test_basic( +; CHECK-NEXT: call void (ptr, ...) @basic(ptr nonnull @.str.int, i32 [[ARG:%.*]]) +; CHECK-NEXT: ret void +; + call void (ptr, ...) @basic(ptr @.str.int, i32 %arg) + ret void +} + +declare void @basic(ptr, ...) #0 + +;; The "float" aspect is present and needed, so no transformation occurs. +define void @test_float_present(double %arg) { +; CHECK-LABEL: @test_float_present( +; CHECK-NEXT: call void (ptr, ...) @float_present(ptr nonnull @.str.float, double [[ARG:%.*]]) +; CHECK-NEXT: ret void +; + call void (ptr, ...) @float_present(ptr @.str.float, double %arg) + ret void +} + +declare void @float_present(ptr, ...) #1 + +;; The "float" aspect is present but not needed, so the call is transformed. +define void @test_float_absent(i32 %arg) { +; CHECK-LABEL: @test_float_absent( +; CHECK-NEXT: call void (ptr, ...) @float_present_mod(ptr nonnull @.str.int, i32 [[ARG:%.*]]) +; CHECK-NEXT: ret void +; + call void (ptr, ...) @float_absent(ptr @.str.int, i32 %arg) + ret void +} + +declare void @float_absent(ptr, ...) #1 + +;; Unknown aspects are always considered needed, so no transformation occurs. +define void @test_unknown_aspects(i32 %arg) { +; CHECK-LABEL: @test_unknown_aspects( +; CHECK-NEXT: call void (ptr, ...) @unknown_aspects(ptr nonnull @.str.int, i32 [[ARG:%.*]]) +; CHECK-NEXT: ret void +; + call void (ptr, ...) @unknown_aspects(ptr @.str.int, i32 %arg) + ret void +} + +declare void @unknown_aspects(ptr, ...) #2 + +;; The call has no arguments to check, so the "float" aspect is not needed and +;; the call is transformed. +define void @test_no_args_to_check() { +; CHECK-LABEL: @test_no_args_to_check( +; CHECK-NEXT: call void (ptr, ...) @float_present_mod(ptr nonnull @.str.noargs) +; CHECK-NEXT: ret void +; + call void (ptr, ...) @no_args_to_check(ptr @.str.noargs) + ret void +} + +declare void @no_args_to_check(ptr, ...) #1 + +;; The first argument index is not 2. The "float" aspect is needed, so no +;; transformation occurs. +define void @test_first_arg_idx(i32 %ignored, double %arg) { +; CHECK-LABEL: @test_first_arg_idx( +; CHECK-NEXT: call void (i32, ptr, ...) @first_arg_idx(i32 [[IGNORED:%.*]], ptr nonnull @.str.float, double [[ARG:%.*]]) +; CHECK-NEXT: ret void +; + call void (i32, ptr, ...) @first_arg_idx(i32 %ignored, ptr @.str.float, double %arg) + ret void +} + +declare void @first_arg_idx(i32, ptr, ...) #3 + +;; One aspect ("unknown") is needed, but one ("float") is not. The call is +;; transformed, and a reference to the needed aspect is emitted. 
+define void @test_partial_aspects(i32 %arg) { +; CHECK-LABEL: @test_partial_aspects( +; CHECK-NEXT: call void (ptr, ...) @multiple_aspects_mod(ptr nonnull @.str.int, i32 [[ARG:%.*]]) +; CHECK-NEXT: call void @llvm.reloc.none(metadata !"basic_impl_unknown") +; CHECK-NEXT: ret void +; + call void (ptr, ...) @partial_aspects(ptr @.str.int, i32 %arg) + ret void +} + +declare void @partial_aspects(ptr, ...) #4 + +attributes #0 = { "modular-format"="printf,1,2,basic_mod,basic_impl" } +attributes #1 = { "modular-format"="printf,1,2,float_present_mod,basic_impl,float" } +attributes #2 = { "modular-format"="printf,1,2,unknown_aspects_mod,basic_impl,unknown1,unknown2" } +attributes #3 = { "modular-format"="printf,2,3,first_arg_idx_mod,basic_impl,float" } +attributes #4 = { "modular-format"="printf,1,2,multiple_aspects_mod,basic_impl,float,unknown" } diff --git a/llvm/test/Verifier/modular-format.ll b/llvm/test/Verifier/modular-format.ll new file mode 100644 index 0000000000000..abdd73d098be1 --- /dev/null +++ b/llvm/test/Verifier/modular-format.ll @@ -0,0 +1,41 @@ +; RUN: not llvm-as < %s -o /dev/null 2>&1 | FileCheck %s + +define void @test_too_few_arguments(i32 %arg, ...) "modular-format"="printf,1,2,basic_mod" { + ret void +} +; CHECK: modular-format attribute requires at least 5 arguments +; CHECK-NEXT: ptr @test_too_few_arguments + +define void @test_first_arg_index_not_integer(i32 %arg, ...) "modular-format"="printf,1,foo,basic_mod,basic_impl" { + ret void +} +; CHECK: modular-format attribute first arg index is not an integer +; CHECK-NEXT: ptr @test_first_arg_index_not_integer + +define void @test_first_arg_index_zero(i32 %arg) "modular-format"="printf,1,0,basic_mod,basic_impl" { + ret void +} +; CHECK: modular-format attribute first arg index is out of bounds +; CHECK-NEXT: ptr @test_first_arg_index_zero + +define void @test_first_arg_index_out_of_bounds(i32 %arg) "modular-format"="printf,1,2,basic_mod,basic_impl" { + ret void +} +; CHECK: modular-format attribute first arg index is out of bounds +; CHECK-NEXT: ptr @test_first_arg_index_out_of_bounds + +define void @test_first_arg_index_out_of_bounds_varargs(i32 %arg, ...) "modular-format"="printf,1,3,basic_mod,basic_impl" { + ret void +} +; CHECK: modular-format attribute first arg index is out of bounds +; CHECK-NEXT: ptr @test_first_arg_index_out_of_bounds_varargs + +; CHECK-NOT: ptr @test_first_arg_index_in_bounds +define void @test_first_arg_index_in_bounds(i32 %arg) "modular-format"="printf,1,1,basic_mod,basic_impl" { + ret void +} + +; CHECK-NOT: ptr @test_first_arg_index_in_bounds_varargs +define void @test_first_arg_index_in_bounds_varargs(i32 %arg, ...) "modular-format"="printf,1,2,basic_mod,basic_impl" { + ret void +} From 4c3e0320a103b34d6d5570da69b2fed3d9694b12 Mon Sep 17 00:00:00 2001 From: Amir Ayupov Date: Tue, 28 Oct 2025 14:05:21 -0700 Subject: [PATCH 38/64] [BOLT] Move call probe information to CallSiteInfo Pseudo probe matching (#100446) needs callee information for call probes. Embed call probe information (probe id, inline tree node, indirect flag) into CallSiteInfo. As a consequence: - Remove call probes from PseudoProbeInfo to avoid duplication, making it only contain block probes. - Probe grouping across inline tree nodes becomes more potent + allows to unambiguously elide block id 1 (common case). Block mask (blx) encoding becomes a low-ROI optimization and will be replaced by a more compact encoding leveraging simplified PseudoProbeInfo in #166680. The size increase is ~3% for an XL profile (461->475MB). 
Compact block probe encoding shrinks it by ~6%.
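An illustrative fragment of the new encoding (the values mirror the updated
tests): a call site with an attached call probe and a group of block probes
are now emitted as

  calls: [ { off: 0x4, fid: 0, cnt: 0, pp: 2 } ]
  probes: [ { blk: [ 1, 4 ] } ]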
Test Plan: updated pseudoprobe-decoding-{inline,noinline}.test

Reviewers: paschalis-mpeis, ayermolo, yota9, yozhu, rafaelauler, maksfb

Reviewed By: rafaelauler

Pull Request: https://github.com/llvm/llvm-project/pull/165490
---
 .../include/bolt/Profile/ProfileYAMLMapping.h |  26 ++---
 bolt/include/bolt/Profile/YAMLProfileWriter.h |  35 +++---
 bolt/lib/Profile/DataAggregator.cpp           |  16 +--
 bolt/lib/Profile/StaleProfileMatching.cpp     |  24 +---
 bolt/lib/Profile/YAMLProfileWriter.cpp        | 103 ++++++++++--------
 ...atch-blocks-with-pseudo-probes-inline.test |   6 +-
 .../X86/match-blocks-with-pseudo-probes.test  |   2 +-
 .../test/X86/pseudoprobe-decoding-inline.test |   6 +-
 .../X86/pseudoprobe-decoding-noinline.test    |   7 +-
 9 files changed, 108 insertions(+), 117 deletions(-)

diff --git a/bolt/include/bolt/Profile/ProfileYAMLMapping.h b/bolt/include/bolt/Profile/ProfileYAMLMapping.h
index 41e2bd1651efd..b393c85321b7d 100644
--- a/bolt/include/bolt/Profile/ProfileYAMLMapping.h
+++ b/bolt/include/bolt/Profile/ProfileYAMLMapping.h
@@ -29,6 +29,10 @@ struct CallSiteInfo {
   uint32_t EntryDiscriminator{0}; /// multiple entry discriminator
   uint64_t Count{0};
   uint64_t Mispreds{0};
+  // Pseudo probe information, optional
+  uint32_t Probe{0};
+  bool Indirect = false;
+  uint32_t InlineTreeNode{0};
 
   bool operator==(const CallSiteInfo &Other) const {
     return Offset == Other.Offset && DestId == Other.DestId &&
@@ -63,6 +67,9 @@ template <> struct MappingTraits<bolt::CallSiteInfo> {
     YamlIO.mapOptional("disc", CSI.EntryDiscriminator, (uint32_t)0);
     YamlIO.mapRequired("cnt", CSI.Count);
     YamlIO.mapOptional("mis", CSI.Mispreds, (uint64_t)0);
+    YamlIO.mapOptional("pp", CSI.Probe, 0);
+    YamlIO.mapOptional("ppn", CSI.InlineTreeNode, 0);
+    YamlIO.mapOptional("ind", CSI.Indirect, false);
   }
 
   static const bool flow = true;
@@ -95,29 +102,20 @@ namespace bolt {
 struct PseudoProbeInfo {
-  uint32_t InlineTreeIndex = 0;
-  uint64_t BlockMask = 0; // bitset with probe indices from 1 to 64
-  std::vector<uint64_t> BlockProbes; // block probes with indices above 64
-  std::vector<uint64_t> CallProbes;
-  std::vector<uint64_t> IndCallProbes;
+  std::vector<uint64_t> BlockProbes;
   std::vector<uint32_t> InlineTreeNodes;
 
   bool operator==(const PseudoProbeInfo &Other) const {
-    return InlineTreeIndex == Other.InlineTreeIndex &&
-           BlockProbes == Other.BlockProbes && CallProbes == Other.CallProbes &&
-           IndCallProbes == Other.IndCallProbes;
+    return InlineTreeNodes == Other.InlineTreeNodes &&
+           BlockProbes == Other.BlockProbes;
   }
 };
 } // end namespace bolt
 
 template <> struct MappingTraits<bolt::PseudoProbeInfo> {
   static void mapping(IO &YamlIO, bolt::PseudoProbeInfo &PI) {
-    YamlIO.mapOptional("blx", PI.BlockMask, 0);
-    YamlIO.mapOptional("blk", PI.BlockProbes, std::vector<uint64_t>());
-    YamlIO.mapOptional("call", PI.CallProbes, std::vector<uint64_t>());
-    YamlIO.mapOptional("icall", PI.IndCallProbes, std::vector<uint64_t>());
-    YamlIO.mapOptional("id", PI.InlineTreeIndex, 0);
-    YamlIO.mapOptional("ids", PI.InlineTreeNodes, std::vector<uint32_t>());
+    YamlIO.mapOptional("blk", PI.BlockProbes, std::vector<uint64_t>(1, 1));
+    YamlIO.mapOptional("ids", PI.InlineTreeNodes, std::vector<uint32_t>(1, 0));
   }
 
   static const bool flow = true;
diff --git a/bolt/include/bolt/Profile/YAMLProfileWriter.h b/bolt/include/bolt/Profile/YAMLProfileWriter.h
index d4d7217464cc8..50ee78d342df8 100644
--- a/bolt/include/bolt/Profile/YAMLProfileWriter.h
+++ b/bolt/include/bolt/Profile/YAMLProfileWriter.h
@@ -74,25 +74,24 @@ class YAMLProfileWriter {
   collectInlineTree(const MCPseudoProbeDecoder &Decoder,
                     const MCDecodedPseudoProbeInlineTree &Root);
 
-  // 0 - block probe, 1 - indirect call, 2 - direct call
-  using ProbeList = std::array<SmallVector<uint64_t>, 3>;
-  using NodeIdToProbes = DenseMap<uint32_t, ProbeList>;
-  static std::vector<yaml::bolt::PseudoProbeInfo>
-  convertNodeProbes(NodeIdToProbes &NodeProbes);
-
 public:
-  template <typename T>
-  static std::vector<yaml::bolt::PseudoProbeInfo>
-  writeBlockProbes(T Probes, const InlineTreeMapTy &InlineTreeNodeId) {
-    NodeIdToProbes NodeProbes;
-    for (const MCDecodedPseudoProbe &Probe : Probes) {
-      auto It = InlineTreeNodeId.find(Probe.getInlineTreeNode());
-      if (It == InlineTreeNodeId.end())
-        continue;
-      NodeProbes[It->second][Probe.getType()].emplace_back(Probe.getIndex());
-    }
-    return convertNodeProbes(NodeProbes);
-  }
+  class BlockProbeCtx {
+    struct Call {
+      uint64_t Id;
+      uint32_t Node;
+      bool Indirect;
+      bool Used;
+    };
+    // Group block probes by node id.
+    DenseMap<uint32_t, std::vector<uint64_t>> NodeToProbes;
+    // Offset -> call probe
+    DenseMap<uint32_t, Call> CallProbes;
+
+  public:
+    void addBlockProbe(const InlineTreeMapTy &Map,
+                       const MCDecodedPseudoProbe &Probe, uint32_t ProbeOffset);
+    void finalize(yaml::bolt::BinaryBasicBlockProfile &YamlBB);
+  };
 };
 } // namespace bolt
 } // namespace llvm
diff --git a/bolt/lib/Profile/DataAggregator.cpp b/bolt/lib/Profile/DataAggregator.cpp
index cafe4bfebf19d..6b969011df589 100644
--- a/bolt/lib/Profile/DataAggregator.cpp
+++ b/bolt/lib/Profile/DataAggregator.cpp
@@ -2397,10 +2397,7 @@ std::error_code DataAggregator::writeBATYAML(BinaryContext &BC,
           PseudoProbeDecoder->getAddress2ProbesMap();
       BinaryFunction::FragmentsSetTy Fragments(BF->Fragments);
       Fragments.insert(BF);
-      DenseMap<
-          uint32_t,
-          std::vector<std::reference_wrapper<const MCDecodedPseudoProbe>>>
-          BlockProbes;
+      DenseMap<uint32_t, YAMLProfileWriter::BlockProbeCtx> BlockCtx;
       for (const BinaryFunction *F : Fragments) {
         const uint64_t FuncAddr = F->getAddress();
         for (const MCDecodedPseudoProbe &Probe :
@@ -2408,15 +2405,14 @@ std::error_code DataAggregator::writeBATYAML(BinaryContext &BC,
           const uint32_t OutputAddress = Probe.getAddress();
           const uint32_t InputOffset = BAT->translate(
               FuncAddr, OutputAddress - FuncAddr, /*IsBranchSrc=*/true);
-          const unsigned BlockIndex = getBlock(InputOffset).second;
-          BlockProbes[BlockIndex].emplace_back(Probe);
+          const auto &[BlockOffset, BlockIndex] = getBlock(InputOffset);
+          BlockCtx[BlockIndex].addBlockProbe(InlineTreeNodeId, Probe,
+                                             InputOffset - BlockOffset);
         }
       }
 
-      for (auto &[Block, Probes] : BlockProbes) {
-        YamlBF.Blocks[Block].PseudoProbes =
-            YAMLProfileWriter::writeBlockProbes(Probes, InlineTreeNodeId);
-      }
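+      // Emit the grouped block probes and call probe info collected above
+      // into each block's YAML profile.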
+      for (auto &[Block, Ctx] : BlockCtx)
+        Ctx.finalize(YamlBF.Blocks[Block]);
     }
 
     // Skip printing if there's no profile data
     llvm::erase_if(
diff --git a/bolt/lib/Profile/StaleProfileMatching.cpp b/bolt/lib/Profile/StaleProfileMatching.cpp
index 1a61949d77472..5fb65153cf313 100644
--- a/bolt/lib/Profile/StaleProfileMatching.cpp
+++ b/bolt/lib/Profile/StaleProfileMatching.cpp
@@ -348,26 +348,10 @@ class StaleMatcher {
       return It->second;
     };
 
-    auto matchPseudoProbeInfo = [&](const yaml::bolt::PseudoProbeInfo
-                                        &ProfileProbe,
-                                    uint32_t NodeId) {
-      for (uint64_t Index = 0; Index < 64; ++Index)
-        if (ProfileProbe.BlockMask & 1ull << Index)
-          ++FlowBlockMatchCount[matchProfileProbeToBlock(NodeId, Index + 1)];
-      for (const auto &ProfileProbes :
-           {ProfileProbe.BlockProbes, ProfileProbe.IndCallProbes,
-            ProfileProbe.CallProbes})
-        for (uint64_t ProfileProbe : ProfileProbes)
-          ++FlowBlockMatchCount[matchProfileProbeToBlock(NodeId, ProfileProbe)];
-    };
-
-    for (const yaml::bolt::PseudoProbeInfo &ProfileProbe : BlockPseudoProbes) {
-      if (!ProfileProbe.InlineTreeNodes.empty())
-        for (uint32_t ProfileInlineTreeNode : ProfileProbe.InlineTreeNodes)
-          matchPseudoProbeInfo(ProfileProbe, ProfileInlineTreeNode);
-      else
-        matchPseudoProbeInfo(ProfileProbe, ProfileProbe.InlineTreeIndex);
-    }
+    for (const yaml::bolt::PseudoProbeInfo &ProfileProbe : BlockPseudoProbes)
+      for (uint32_t Node : ProfileProbe.InlineTreeNodes)
+        for (uint64_t Probe : ProfileProbe.BlockProbes)
+          ++FlowBlockMatchCount[matchProfileProbeToBlock(Node, Probe)];
     uint32_t BestMatchCount = 0;
     uint32_t TotalMatchCount = 0;
     const FlowBlock *BestMatchBlock = nullptr;
diff --git a/bolt/lib/Profile/YAMLProfileWriter.cpp b/bolt/lib/Profile/YAMLProfileWriter.cpp
index 5c631f93f01da..cd4e77b0dbb60 100644
--- a/bolt/lib/Profile/YAMLProfileWriter.cpp
+++ b/bolt/lib/Profile/YAMLProfileWriter.cpp
@@ -129,50 +129,62 @@ YAMLProfileWriter::convertPseudoProbeDesc(const MCPseudoProbeDecoder &Decoder) {
   return {Desc, InlineTree};
 }
 
-std::vector<yaml::bolt::PseudoProbeInfo>
-YAMLProfileWriter::convertNodeProbes(NodeIdToProbes &NodeProbes) {
-  struct BlockProbeInfoHasher {
-    size_t operator()(const yaml::bolt::PseudoProbeInfo &BPI) const {
-      return llvm::hash_combine(llvm::hash_combine_range(BPI.BlockProbes),
-                                llvm::hash_combine_range(BPI.CallProbes),
-                                llvm::hash_combine_range(BPI.IndCallProbes));
+void YAMLProfileWriter::BlockProbeCtx::addBlockProbe(
+    const InlineTreeMapTy &Map, const MCDecodedPseudoProbe &Probe,
+    uint32_t ProbeOffset) {
+  auto It = Map.find(Probe.getInlineTreeNode());
+  if (It == Map.end())
+    return;
+  auto NodeId = It->second;
+  uint32_t Index = Probe.getIndex();
+  if (Probe.isCall())
+    CallProbes[ProbeOffset] =
+        Call{Index, NodeId, Probe.isIndirectCall(), false};
+  else
+    NodeToProbes[NodeId].emplace_back(Index);
+}
+
+void YAMLProfileWriter::BlockProbeCtx::finalize(
+    yaml::bolt::BinaryBasicBlockProfile &YamlBB) {
+  // Hash block probes by vector
+  struct ProbeHasher {
+    size_t operator()(const ArrayRef<uint64_t> Probes) const {
+      return llvm::hash_combine_range(Probes);
     }
   };
 
-  // Check identical BlockProbeInfo structs and merge them
-  std::unordered_map<yaml::bolt::PseudoProbeInfo, std::vector<uint32_t>,
-                     BlockProbeInfoHasher>
-      BPIToNodes;
-  for (auto &[NodeId, Probes] : NodeProbes) {
-    yaml::bolt::PseudoProbeInfo BPI;
-    BPI.BlockProbes = std::vector<uint64_t>(Probes[0].begin(), Probes[0].end());
-    BPI.IndCallProbes = std::vector<uint64_t>(Probes[1].begin(), Probes[1].end());
-    BPI.CallProbes = std::vector<uint64_t>(Probes[2].begin(), Probes[2].end());
-    BPIToNodes[BPI].push_back(NodeId);
+  // Check identical block probes and merge them
+  std::unordered_map<std::vector<uint64_t>, std::vector<uint32_t>, ProbeHasher>
+      ProbesToNodes;
+  for (auto &[NodeId, Probes] : NodeToProbes) {
+    llvm::sort(Probes);
+    ProbesToNodes[Probes].emplace_back(NodeId);
   }
-
-  auto handleMask = [](const auto &Ids, auto &Vec, auto &Mask) {
-    for (auto Id : Ids)
-      if (Id > 64)
-        Vec.emplace_back(Id);
-      else
-        Mask |= 1ull << (Id - 1);
-  };
-
-  // Add to YAML with merged nodes/block mask optimizations
-  std::vector<yaml::bolt::PseudoProbeInfo> YamlProbes;
-  YamlProbes.reserve(BPIToNodes.size());
-  for (const auto &[BPI, Nodes] : BPIToNodes) {
-    auto &YamlBPI = YamlProbes.emplace_back(yaml::bolt::PseudoProbeInfo());
-    YamlBPI.CallProbes = BPI.CallProbes;
-    YamlBPI.IndCallProbes = BPI.IndCallProbes;
-    if (Nodes.size() == 1)
-      YamlBPI.InlineTreeIndex = Nodes.front();
-    else
-      YamlBPI.InlineTreeNodes = Nodes;
-    handleMask(BPI.BlockProbes, YamlBPI.BlockProbes, YamlBPI.BlockMask);
+  for (auto &[Probes, Nodes] : ProbesToNodes) {
+    llvm::sort(Nodes);
+    YamlBB.PseudoProbes.emplace_back(
+        yaml::bolt::PseudoProbeInfo{Probes, Nodes});
+  }
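+  // Attach call probe info to the call sites already collected for this
+  // block, matching them by offset; unmatched probes become new call sites.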
+  for (yaml::bolt::CallSiteInfo &CSI : YamlBB.CallSites) {
+    auto It = CallProbes.find(CSI.Offset);
+    if (It == CallProbes.end())
+      continue;
+    Call &Probe = It->second;
+    CSI.Probe = Probe.Id;
+    CSI.InlineTreeNode = Probe.Node;
+    CSI.Indirect = Probe.Indirect;
+    Probe.Used = true;
+  }
+  for (const auto &[Offset, Probe] : CallProbes) {
+    if (Probe.Used)
+      continue;
+    yaml::bolt::CallSiteInfo CSI;
+    CSI.Offset = Offset;
+    CSI.Probe = Probe.Id;
+    CSI.InlineTreeNode = Probe.Node;
+    CSI.Indirect = Probe.Indirect;
+    YamlBB.CallSites.emplace_back(CSI);
+  }
-  return YamlProbes;
 }
 
 std::tuple<std::vector<yaml::bolt::InlineTreeNode>,
@@ -343,12 +355,13 @@ YAMLProfileWriter::convert(const BinaryFunction &BF, bool UseDFS,
       const AddressProbesMap &ProbeMap =
           PseudoProbeDecoder->getAddress2ProbesMap();
       const uint64_t FuncAddr = BF.getAddress();
-      const std::pair<uint32_t, uint32_t> &BlockRange =
-          BB->getInputAddressRange();
-      const std::pair<uint64_t, uint64_t> BlockAddrRange = {
-          FuncAddr + BlockRange.first, FuncAddr + BlockRange.second};
-      auto Probes = ProbeMap.find(BlockAddrRange.first, BlockAddrRange.second);
-      YamlBB.PseudoProbes = writeBlockProbes(Probes, InlineTreeNodeId);
+      auto [Start, End] = BB->getInputAddressRange();
+      Start += FuncAddr;
+      End += FuncAddr;
+      BlockProbeCtx Ctx;
+      for (const MCDecodedPseudoProbe &Probe : ProbeMap.find(Start, End))
+        Ctx.addBlockProbe(InlineTreeNodeId, Probe, Probe.getAddress() - Start);
+      Ctx.finalize(YamlBB);
     }
 
     YamlBF.Blocks.emplace_back(YamlBB);
diff --git a/bolt/test/X86/match-blocks-with-pseudo-probes-inline.test b/bolt/test/X86/match-blocks-with-pseudo-probes-inline.test
index accb4742851ea..9224cf163dbcc 100644
--- a/bolt/test/X86/match-blocks-with-pseudo-probes-inline.test
+++ b/bolt/test/X86/match-blocks-with-pseudo-probes-inline.test
@@ -30,7 +30,7 @@ functions:
         insns: 11
         hash: 0x1
        exec: 1
-        probes: [ { blx: 9 } ]
+        probes: [ { blk: [ 1, 4 ] } ]
     inline_tree: [ { } ]
   - name: foo
     fid: 10
@@ -43,7 +43,7 @@ functions:
        hash: 0x2
        exec: 1
        succ: [ { bid: 3, cnt: 0 } ]
-        probes: [ { blx: 3 } ]
+        probes: [ { blk: [ 1, 2 ] } ]
     inline_tree: [ { g: 1 }, { g: 0, cs: 8 } ]
   - name: main
     fid: 11
@@ -56,7 +56,7 @@ functions:
        hash: 0x3
        exec: 1
        succ: [ { bid: 3, cnt: 0 } ]
-        probes: [ { blx: 3, id: 1 }, { blx: 1 } ]
+        probes: [ { blk: [ 1, 2 ], ids: [ 1 ] }, { blk: [ 1 ] } ]
     inline_tree: [ { g: 2 }, { g: 1, cs: 2 }, { g: 0, p: 1, cs: 8 } ]
 pseudo_probe_desc:
   gs: [ 0xE413754A191DB537, 0x5CF8C24CDB18BDAC, 0xDB956436E78DD5FA ]
diff --git a/bolt/test/X86/match-blocks-with-pseudo-probes.test b/bolt/test/X86/match-blocks-with-pseudo-probes.test
index 40cb64ee82919..7be327d698b17 100644
--- a/bolt/test/X86/match-blocks-with-pseudo-probes.test
+++ b/bolt/test/X86/match-blocks-with-pseudo-probes.test
@@ -55,7 +55,7 @@ functions:
        hash: 0xFFFFFFFFFFFFFFF1
        insns: 1
        succ: [ { bid: 3, cnt: 1} ]
-        probes: [ { blx: 1 } ]
+        probes: [ { blk: [ 1 ] } ]
     inline_tree: [ { g: 0 } ]
 pseudo_probe_desc:
   gs: [ 0xDB956436E78DD5FA ]
diff --git a/bolt/test/X86/pseudoprobe-decoding-inline.test b/bolt/test/X86/pseudoprobe-decoding-inline.test
index e5e8aadc18f9e..9748fc1b6a4d4 100644
--- a/bolt/test/X86/pseudoprobe-decoding-inline.test
+++ b/bolt/test/X86/pseudoprobe-decoding-inline.test
@@ -14,17 +14,17 @@
 # RUN: FileCheck --input-file %t.yaml2 %s --check-prefix CHECK-YAML
 # CHECK-YAML: name: bar
 # CHECK-YAML: - bid: 0
-# CHECK-YAML: probes: [ { blx: 9 } ]
+# CHECK-YAML: probes: [ { blk: [ 1, 4 ] } ]
 # CHECK-YAML: inline_tree: [ { } ]
 #
 # CHECK-YAML: name: foo
 # CHECK-YAML: - bid: 0
-# CHECK-YAML: probes: [ { blx: 3 } ]
+# CHECK-YAML: probes: [ { blk: [ 1, 2 ] } ]
 # CHECK-YAML: inline_tree: [ { g: 1 }, { g: 0, cs: 8 } ]
 #
 # CHECK-YAML: name: main
 # CHECK-YAML: - bid: 0
-# CHECK-YAML: probes: [ { blx: 3, id: 1 }, { blx: 1 } ]
+# CHECK-YAML: probes: [ { blk: [
1, 2 ], ids: [ 1 ] }, { } ] # CHECK-YAML: inline_tree: [ { g: 2 }, { g: 1, cs: 2 }, { g: 0, p: 1, cs: 8 } ] # # CHECK-YAML: pseudo_probe_desc: diff --git a/bolt/test/X86/pseudoprobe-decoding-noinline.test b/bolt/test/X86/pseudoprobe-decoding-noinline.test index 36a2fab74e857..4ba51cdc96f9e 100644 --- a/bolt/test/X86/pseudoprobe-decoding-noinline.test +++ b/bolt/test/X86/pseudoprobe-decoding-noinline.test @@ -15,17 +15,18 @@ # RUN: FileCheck --input-file %t.yaml2 %s --check-prefix CHECK-YAML # CHECK-YAML: name: bar # CHECK-YAML: - bid: 0 -# CHECK-YAML: probes: [ { blx: 9 } ] +# CHECK-YAML: probes: [ { blk: [ 1, 4 ] } ] # CHECK-YAML: inline_tree: [ { } ] # # CHECK-YAML: name: foo # CHECK-YAML: - bid: 0 -# CHECK-YAML: probes: [ { blx: 3 } ] +# CHECK-YAML: probes: [ { blk: [ 1, 2 ] } ] # CHECK-YAML: inline_tree: [ { g: 2 } ] # # CHECK-YAML: name: main # CHECK-YAML: - bid: 0 -# CHECK-YAML: probes: [ { blx: 1, call: [ 2 ] } ] +# CHECK-YAML: calls: [ { off: 0x4, fid: 0, cnt: 0, pp: 2 } ] +# CHECK-YAML: probes: [ { } ] # CHECK-YAML: inline_tree: [ { g: 1 } ] # # CHECK-YAML: pseudo_probe_desc: From 9fac225e22a95aa1866e6595d72ef4ce663a6b14 Mon Sep 17 00:00:00 2001 From: David Pagan Date: Tue, 11 Nov 2025 11:59:18 -0800 Subject: [PATCH 39/64] [clang][OpenMP] 6.0: Add default clause support for 'target' directive (#162910) Per OpenMP 6.0 specification, section 7.5.1, default Clause Page 224, lines 3-5 default Clause, Semantics If data-sharing-attribute is shared then the clause has no effect on a target construct; otherwise, its effect on a target construct is equivalent to specifying the defaultmap clause with the same data-sharing-attribute and variable-category. Testing: OpenMP LIT tests check-all --- clang/docs/OpenMPSupport.rst | 2 +- clang/docs/ReleaseNotes.rst | 1 + clang/lib/Sema/SemaOpenMP.cpp | 126 +- clang/test/OpenMP/target_default_codegen.cpp | 2020 +++++++++++++++++ clang/test/OpenMP/target_default_messages.cpp | 2 + 5 files changed, 2115 insertions(+), 36 deletions(-) create mode 100644 clang/test/OpenMP/target_default_codegen.cpp diff --git a/clang/docs/OpenMPSupport.rst b/clang/docs/OpenMPSupport.rst index 10a8d095fede3..f7e6061044c6d 100644 --- a/clang/docs/OpenMPSupport.rst +++ b/clang/docs/OpenMPSupport.rst @@ -559,7 +559,7 @@ implementation. 
+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ | Clarifications to Fortran map semantics | :none:`unclaimed` | :none:`unclaimed` | | +-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ -| default clause at target construct | :part:`In Progress` | :none:`unclaimed` | | +| default clause at target construct | :good:`done` | :none:`unclaimed` | https://github.com/llvm/llvm-project/pull/162910 | +-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ | ref count update use_device_{ptr, addr} | :none:`unclaimed` | :none:`unclaimed` | | +-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 5b95b44ea9450..88a05affebf9e 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -725,6 +725,7 @@ OpenMP Support - Added support for 'omp fuse' directive. - Updated parsing and semantic analysis support for ``nowait`` clause to accept optional argument in OpenMP >= 60. +- Added support for ``default`` clause on ``target`` directive. Improvements ^^^^^^^^^^^^ diff --git a/clang/lib/Sema/SemaOpenMP.cpp b/clang/lib/Sema/SemaOpenMP.cpp index 465dab2b65eb1..2ab2fd10a942e 100644 --- a/clang/lib/Sema/SemaOpenMP.cpp +++ b/clang/lib/Sema/SemaOpenMP.cpp @@ -17319,45 +17319,101 @@ OMPClause *SemaOpenMP::ActOnOpenMPDefaultClause( << getOpenMPClauseNameForDiag(OMPC_default); return nullptr; } - - switch (M) { - case OMP_DEFAULT_none: - DSAStack->setDefaultDSANone(MLoc); - break; - case OMP_DEFAULT_shared: - DSAStack->setDefaultDSAShared(MLoc); - break; - case OMP_DEFAULT_firstprivate: - DSAStack->setDefaultDSAFirstPrivate(MLoc); - break; - case OMP_DEFAULT_private: - DSAStack->setDefaultDSAPrivate(MLoc); - break; - default: - llvm_unreachable("DSA unexpected in OpenMP default clause"); - } - - switch (VCKind) { - case OMPC_DEFAULT_VC_aggregate: - DSAStack->setDefaultDSAVCAggregate(VCKindLoc); - break; - case OMPC_DEFAULT_VC_all: - DSAStack->setDefaultDSAVCAll(VCKindLoc); - break; - case OMPC_DEFAULT_VC_allocatable: - DSAStack->setDefaultDSAVCAllocatable(VCKindLoc); - break; - case OMPC_DEFAULT_VC_pointer: - DSAStack->setDefaultDSAVCPointer(VCKindLoc); - break; - case OMPC_DEFAULT_VC_scalar: - DSAStack->setDefaultDSAVCScalar(VCKindLoc); - break; - default: + if (VCKind == OMPC_DEFAULT_VC_unknown) { Diag(VCKindLoc, diag::err_omp_default_vc) << getOpenMPSimpleClauseTypeName(OMPC_default, unsigned(M)); + return nullptr; } + bool IsTargetDefault = + getLangOpts().OpenMP >= 60 && + isOpenMPTargetExecutionDirective(DSAStack->getCurrentDirective()); + + // OpenMP 6.0, page 224, lines 3-4 default Clause, Semantics + // If data-sharing-attribute is shared then the clause has no effect + // on a target construct; + if (IsTargetDefault && M == OMP_DEFAULT_shared) + return nullptr; + + auto SetDefaultClauseAttrs = [&](llvm::omp::DefaultKind M, + OpenMPDefaultClauseVariableCategory VCKind) { + OpenMPDefaultmapClauseModifier DefMapMod; + OpenMPDefaultmapClauseKind DefMapKind; 
+ // default data-sharing-attribute + switch (M) { + case OMP_DEFAULT_none: + if (IsTargetDefault) + DefMapMod = OMPC_DEFAULTMAP_MODIFIER_none; + else + DSAStack->setDefaultDSANone(MLoc); + break; + case OMP_DEFAULT_firstprivate: + if (IsTargetDefault) + DefMapMod = OMPC_DEFAULTMAP_MODIFIER_firstprivate; + else + DSAStack->setDefaultDSAFirstPrivate(MLoc); + break; + case OMP_DEFAULT_private: + if (IsTargetDefault) + DefMapMod = OMPC_DEFAULTMAP_MODIFIER_private; + else + DSAStack->setDefaultDSAPrivate(MLoc); + break; + case OMP_DEFAULT_shared: + assert(!IsTargetDefault && "DSA shared invalid with target directive"); + DSAStack->setDefaultDSAShared(MLoc); + break; + default: + llvm_unreachable("unexpected DSA in OpenMP default clause"); + } + // default variable-category + switch (VCKind) { + case OMPC_DEFAULT_VC_aggregate: + if (IsTargetDefault) + DefMapKind = OMPC_DEFAULTMAP_aggregate; + else + DSAStack->setDefaultDSAVCAggregate(VCKindLoc); + break; + case OMPC_DEFAULT_VC_pointer: + if (IsTargetDefault) + DefMapKind = OMPC_DEFAULTMAP_pointer; + else + DSAStack->setDefaultDSAVCPointer(VCKindLoc); + break; + case OMPC_DEFAULT_VC_scalar: + if (IsTargetDefault) + DefMapKind = OMPC_DEFAULTMAP_scalar; + else + DSAStack->setDefaultDSAVCScalar(VCKindLoc); + break; + case OMPC_DEFAULT_VC_all: + if (IsTargetDefault) + DefMapKind = OMPC_DEFAULTMAP_all; + else + DSAStack->setDefaultDSAVCAll(VCKindLoc); + break; + default: + llvm_unreachable("unexpected variable category in OpenMP default clause"); + } + // OpenMP 6.0, page 224, lines 4-5 default Clause, Semantics + // otherwise, its effect on a target construct is equivalent to + // specifying the defaultmap clause with the same data-sharing-attribute + // and variable-category. + // + // If earlier than OpenMP 6.0, or not a target directive, the default DSA + // is/was set as before. 
+ if (IsTargetDefault) { + if (DefMapKind == OMPC_DEFAULTMAP_all) { + DSAStack->setDefaultDMAAttr(DefMapMod, OMPC_DEFAULTMAP_aggregate, MLoc); + DSAStack->setDefaultDMAAttr(DefMapMod, OMPC_DEFAULTMAP_scalar, MLoc); + DSAStack->setDefaultDMAAttr(DefMapMod, OMPC_DEFAULTMAP_pointer, MLoc); + } else { + DSAStack->setDefaultDMAAttr(DefMapMod, DefMapKind, MLoc); + } + } + }; + + SetDefaultClauseAttrs(M, VCKind); return new (getASTContext()) OMPDefaultClause(M, MLoc, VCKind, VCKindLoc, StartLoc, LParenLoc, EndLoc); } diff --git a/clang/test/OpenMP/target_default_codegen.cpp b/clang/test/OpenMP/target_default_codegen.cpp new file mode 100644 index 0000000000000..eadd0e57945b1 --- /dev/null +++ b/clang/test/OpenMP/target_default_codegen.cpp @@ -0,0 +1,2020 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --include-generated-funcs --replace-value-regex "__omp_offloading_[0-9a-z]+_[0-9a-z]+" "reduction_size[.].+[.]" "pl_cond[.].+[.|,]" --prefix-filecheck-ir-name _ --version 5 +// expected-no-diagnostics + +// RUN: %clang_cc1 -no-enable-noundef-analysis -verify -Wno-vla -fopenmp -fopenmp-version=60 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK-64 +// RUN: %clang_cc1 -no-enable-noundef-analysis -fopenmp -fopenmp-version=60 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -fopenmp -fopenmp-version=60 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify -Wno-vla %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK-64 +// RUN: %clang_cc1 -no-enable-noundef-analysis -verify -Wno-vla -fopenmp -fopenmp-version=60 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK-32 +// RUN: %clang_cc1 -no-enable-noundef-analysis -fopenmp -fopenmp-version=60 -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -fopenmp -fopenmp-version=60 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify -Wno-vla %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK-32 + +// RUN: %clang_cc1 -no-enable-noundef-analysis -verify -Wno-vla -fopenmp-simd -fopenmp-version=60 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY-64 %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -fopenmp-simd -fopenmp-version=60 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -fopenmp-simd -fopenmp-version=60 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify -Wno-vla %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY-64 %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -verify -Wno-vla -fopenmp-simd -fopenmp-version=60 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY-32 %s +// 
RUN: %clang_cc1 -no-enable-noundef-analysis -fopenmp-simd -fopenmp-version=60 -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -fopenmp-simd -fopenmp-version=60 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify -Wno-vla %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY-32 %s + +#ifndef HEADER +#define HEADER +void foo1(int a) { + double d = (double)a; + + #pragma omp target default(private: scalar) + { + d += 1.0; + } +} + +void foo2() { + int pvtArr[10]; + + #pragma omp target default(private: aggregate) + { + pvtArr[5]++; + } +} + +void foo3() { + int *pa; + + #pragma omp target default(private: pointer) + { + pa[50]++; + } +} + +// Specified variable-category doesn't apply to referenced variable, so +// normal implicitly determined data-sharing applies. +void foo4() { + int p; + + #pragma omp target default(private: pointer) + { + p++; + } +} + +// Verify default clause with variable-category 'all' is equivalent to no +// variable-category. IR checks generated with 'all' but test runs without +// variable-category. +void foo5(int a) { + double d = (double)a; + int pvtArr[10]; + int *pa; + + #pragma omp target default(private) + { + d += 1.0; + pvtArr[5]++; + pa[50]++; + } +} + +// Verify default clause with 'shared' DSA is ignored. This makes it +// equivalent to target with no default clause. IR checks generated with +// no default clause but test runs with default 'shared'. +void foo6(int a) { + double d = (double)a; + int pvtArr[10]; + int *pa; + + #pragma omp target default(shared) + { + d += 1.0; + pvtArr[5]++; + pa[50]++; + } +} + +// Verify default clause with 'firstprivate' DSA is equivalent to specifying +// defaultmap with 'firstprivate'. IR checks generated with +// defaultmap(firstprivate) but test runs with default(firstprivate). +void foo7(int a) { + double d = (double)a; + int pvtArr[10]; + int *pa; + + #pragma omp target default(firstprivate) + { + d += 1.0; + pvtArr[5]++; + pa[50]++; + } +} + +// Verify 'default' clause on a combined 'target' directive is equivalent to +// specifying its constituent directives with 'default' clauses. IR checks +// generated with constituent directives but test runs with combined +// directive. 
+void foo8() { + int x = 0; + #pragma omp target teams distribute parallel for default(firstprivate) firstprivate(x) + for (int i=0; i<10; i++) + x += 1; +} +#endif // HEADER +// CK-64-LABEL: define dso_local void @_Z4foo1i( +// CK-64-SAME: i32 signext [[A:%.*]]) #[[ATTR0:[0-9]+]] { +// CK-64-NEXT: [[ENTRY:.*:]] +// CK-64-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 +// CK-64-NEXT: [[D:%.*]] = alloca double, align 8 +// CK-64-NEXT: [[D_CASTED:%.*]] = alloca i64, align 8 +// CK-64-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [1 x ptr], align 8 +// CK-64-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [1 x ptr], align 8 +// CK-64-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [1 x ptr], align 8 +// CK-64-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8 +// CK-64-NEXT: store i32 [[A]], ptr [[A_ADDR]], align 4 +// CK-64-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4 +// CK-64-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP0]] to double +// CK-64-NEXT: store double [[CONV]], ptr [[D]], align 8 +// CK-64-NEXT: [[TMP1:%.*]] = load double, ptr [[D]], align 8 +// CK-64-NEXT: store double [[TMP1]], ptr [[D_CASTED]], align 8 +// CK-64-NEXT: [[TMP2:%.*]] = load i64, ptr [[D_CASTED]], align 8 +// CK-64-NEXT: [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CK-64-NEXT: store i64 [[TMP2]], ptr [[TMP3]], align 8 +// CK-64-NEXT: [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CK-64-NEXT: store i64 [[TMP2]], ptr [[TMP4]], align 8 +// CK-64-NEXT: [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i64 0, i64 0 +// CK-64-NEXT: store ptr null, ptr [[TMP5]], align 8 +// CK-64-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CK-64-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CK-64-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0 +// CK-64-NEXT: store i32 3, ptr [[TMP8]], align 4 +// CK-64-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1 +// CK-64-NEXT: store i32 1, ptr [[TMP9]], align 4 +// CK-64-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2 +// CK-64-NEXT: store ptr [[TMP6]], ptr [[TMP10]], align 8 +// CK-64-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3 +// CK-64-NEXT: store ptr [[TMP7]], ptr [[TMP11]], align 8 +// CK-64-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4 +// CK-64-NEXT: store ptr @.offload_sizes, ptr [[TMP12]], align 8 +// CK-64-NEXT: [[TMP13:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5 +// CK-64-NEXT: store ptr @.offload_maptypes, ptr [[TMP13]], align 8 +// CK-64-NEXT: [[TMP14:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6 +// CK-64-NEXT: store ptr null, ptr [[TMP14]], align 8 +// CK-64-NEXT: [[TMP15:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7 +// CK-64-NEXT: store ptr null, ptr [[TMP15]], align 8 +// CK-64-NEXT: [[TMP16:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8 +// CK-64-NEXT: store i64 0, 
ptr [[TMP16]], align 8 +// CK-64-NEXT: [[TMP17:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9 +// CK-64-NEXT: store i64 0, ptr [[TMP17]], align 8 +// CK-64-NEXT: [[TMP18:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10 +// CK-64-NEXT: store [3 x i32] [i32 -1, i32 0, i32 0], ptr [[TMP18]], align 4 +// CK-64-NEXT: [[TMP19:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11 +// CK-64-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP19]], align 4 +// CK-64-NEXT: [[TMP20:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12 +// CK-64-NEXT: store i32 0, ptr [[TMP20]], align 4 +// CK-64-NEXT: [[TMP21:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1:[0-9]+]], i64 -1, i32 -1, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo1i_l23.region_id, ptr [[KERNEL_ARGS]]) +// CK-64-NEXT: [[TMP22:%.*]] = icmp ne i32 [[TMP21]], 0 +// CK-64-NEXT: br i1 [[TMP22]], label %[[OMP_OFFLOAD_FAILED:.*]], label %[[OMP_OFFLOAD_CONT:.*]] +// CK-64: [[OMP_OFFLOAD_FAILED]]: +// CK-64-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo1i_l23(i64 [[TMP2]]) #[[ATTR2:[0-9]+]] +// CK-64-NEXT: br label %[[OMP_OFFLOAD_CONT]] +// CK-64: [[OMP_OFFLOAD_CONT]]: +// CK-64-NEXT: ret void +// +// +// CK-64-LABEL: define internal void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo1i_l23( +// CK-64-SAME: i64 [[D:%.*]]) #[[ATTR1:[0-9]+]] { +// CK-64-NEXT: [[ENTRY:.*:]] +// CK-64-NEXT: [[D_ADDR:%.*]] = alloca i64, align 8 +// CK-64-NEXT: [[D1:%.*]] = alloca double, align 8 +// CK-64-NEXT: store i64 [[D]], ptr [[D_ADDR]], align 8 +// CK-64-NEXT: [[TMP0:%.*]] = load double, ptr [[D1]], align 8 +// CK-64-NEXT: [[ADD:%.*]] = fadd double [[TMP0]], 1.000000e+00 +// CK-64-NEXT: store double [[ADD]], ptr [[D1]], align 8 +// CK-64-NEXT: ret void +// +// +// CK-64-LABEL: define dso_local void @_Z4foo2v( +// CK-64-SAME: ) #[[ATTR0]] { +// CK-64-NEXT: [[ENTRY:.*:]] +// CK-64-NEXT: [[PVTARR:%.*]] = alloca [10 x i32], align 4 +// CK-64-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [1 x ptr], align 8 +// CK-64-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [1 x ptr], align 8 +// CK-64-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [1 x ptr], align 8 +// CK-64-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8 +// CK-64-NEXT: [[TMP0:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CK-64-NEXT: store ptr [[PVTARR]], ptr [[TMP0]], align 8 +// CK-64-NEXT: [[TMP1:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CK-64-NEXT: store ptr [[PVTARR]], ptr [[TMP1]], align 8 +// CK-64-NEXT: [[TMP2:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i64 0, i64 0 +// CK-64-NEXT: store ptr null, ptr [[TMP2]], align 8 +// CK-64-NEXT: [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CK-64-NEXT: [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CK-64-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0 +// CK-64-NEXT: store i32 3, ptr [[TMP5]], align 4 +// CK-64-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1 +// CK-64-NEXT: store i32 1, ptr [[TMP6]], align 4 +// CK-64-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw 
[[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2 +// CK-64-NEXT: store ptr [[TMP3]], ptr [[TMP7]], align 8 +// CK-64-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3 +// CK-64-NEXT: store ptr [[TMP4]], ptr [[TMP8]], align 8 +// CK-64-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4 +// CK-64-NEXT: store ptr @.offload_sizes.1, ptr [[TMP9]], align 8 +// CK-64-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5 +// CK-64-NEXT: store ptr @.offload_maptypes.2, ptr [[TMP10]], align 8 +// CK-64-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6 +// CK-64-NEXT: store ptr null, ptr [[TMP11]], align 8 +// CK-64-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7 +// CK-64-NEXT: store ptr null, ptr [[TMP12]], align 8 +// CK-64-NEXT: [[TMP13:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8 +// CK-64-NEXT: store i64 0, ptr [[TMP13]], align 8 +// CK-64-NEXT: [[TMP14:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9 +// CK-64-NEXT: store i64 0, ptr [[TMP14]], align 8 +// CK-64-NEXT: [[TMP15:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10 +// CK-64-NEXT: store [3 x i32] [i32 -1, i32 0, i32 0], ptr [[TMP15]], align 4 +// CK-64-NEXT: [[TMP16:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11 +// CK-64-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP16]], align 4 +// CK-64-NEXT: [[TMP17:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12 +// CK-64-NEXT: store i32 0, ptr [[TMP17]], align 4 +// CK-64-NEXT: [[TMP18:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 -1, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo2v_l32.region_id, ptr [[KERNEL_ARGS]]) +// CK-64-NEXT: [[TMP19:%.*]] = icmp ne i32 [[TMP18]], 0 +// CK-64-NEXT: br i1 [[TMP19]], label %[[OMP_OFFLOAD_FAILED:.*]], label %[[OMP_OFFLOAD_CONT:.*]] +// CK-64: [[OMP_OFFLOAD_FAILED]]: +// CK-64-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo2v_l32(ptr [[PVTARR]]) #[[ATTR2]] +// CK-64-NEXT: br label %[[OMP_OFFLOAD_CONT]] +// CK-64: [[OMP_OFFLOAD_CONT]]: +// CK-64-NEXT: ret void +// +// +// CK-64-LABEL: define internal void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo2v_l32( +// CK-64-SAME: ptr nonnull align 4 dereferenceable(40) [[PVTARR:%.*]]) #[[ATTR1]] { +// CK-64-NEXT: [[ENTRY:.*:]] +// CK-64-NEXT: [[PVTARR_ADDR:%.*]] = alloca ptr, align 8 +// CK-64-NEXT: [[PVTARR1:%.*]] = alloca [10 x i32], align 4 +// CK-64-NEXT: store ptr [[PVTARR]], ptr [[PVTARR_ADDR]], align 8 +// CK-64-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PVTARR_ADDR]], align 8, !nonnull [[META19:![0-9]+]], !align [[META20:![0-9]+]] +// CK-64-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], ptr [[PVTARR1]], i64 0, i64 5 +// CK-64-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +// CK-64-NEXT: [[INC:%.*]] = add nsw i32 [[TMP1]], 1 +// CK-64-NEXT: store i32 [[INC]], ptr [[ARRAYIDX]], align 4 +// CK-64-NEXT: ret void +// +// +// CK-64-LABEL: define dso_local void @_Z4foo3v( +// CK-64-SAME: ) #[[ATTR0]] { +// 
CK-64-NEXT: [[ENTRY:.*:]] +// CK-64-NEXT: [[PA:%.*]] = alloca ptr, align 8 +// CK-64-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [1 x ptr], align 8 +// CK-64-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [1 x ptr], align 8 +// CK-64-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [1 x ptr], align 8 +// CK-64-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8 +// CK-64-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PA]], align 8 +// CK-64-NEXT: [[TMP1:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CK-64-NEXT: store ptr [[TMP0]], ptr [[TMP1]], align 8 +// CK-64-NEXT: [[TMP2:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CK-64-NEXT: store ptr [[TMP0]], ptr [[TMP2]], align 8 +// CK-64-NEXT: [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i64 0, i64 0 +// CK-64-NEXT: store ptr null, ptr [[TMP3]], align 8 +// CK-64-NEXT: [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CK-64-NEXT: [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CK-64-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0 +// CK-64-NEXT: store i32 3, ptr [[TMP6]], align 4 +// CK-64-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1 +// CK-64-NEXT: store i32 1, ptr [[TMP7]], align 4 +// CK-64-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2 +// CK-64-NEXT: store ptr [[TMP4]], ptr [[TMP8]], align 8 +// CK-64-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3 +// CK-64-NEXT: store ptr [[TMP5]], ptr [[TMP9]], align 8 +// CK-64-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4 +// CK-64-NEXT: store ptr @.offload_sizes.3, ptr [[TMP10]], align 8 +// CK-64-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5 +// CK-64-NEXT: store ptr @.offload_maptypes.4, ptr [[TMP11]], align 8 +// CK-64-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6 +// CK-64-NEXT: store ptr null, ptr [[TMP12]], align 8 +// CK-64-NEXT: [[TMP13:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7 +// CK-64-NEXT: store ptr null, ptr [[TMP13]], align 8 +// CK-64-NEXT: [[TMP14:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8 +// CK-64-NEXT: store i64 0, ptr [[TMP14]], align 8 +// CK-64-NEXT: [[TMP15:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9 +// CK-64-NEXT: store i64 0, ptr [[TMP15]], align 8 +// CK-64-NEXT: [[TMP16:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10 +// CK-64-NEXT: store [3 x i32] [i32 -1, i32 0, i32 0], ptr [[TMP16]], align 4 +// CK-64-NEXT: [[TMP17:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11 +// CK-64-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP17]], align 4 +// CK-64-NEXT: [[TMP18:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12 +// CK-64-NEXT: store i32 0, ptr 
[[TMP18]], align 4 +// CK-64-NEXT: [[TMP19:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 -1, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo3v_l41.region_id, ptr [[KERNEL_ARGS]]) +// CK-64-NEXT: [[TMP20:%.*]] = icmp ne i32 [[TMP19]], 0 +// CK-64-NEXT: br i1 [[TMP20]], label %[[OMP_OFFLOAD_FAILED:.*]], label %[[OMP_OFFLOAD_CONT:.*]] +// CK-64: [[OMP_OFFLOAD_FAILED]]: +// CK-64-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo3v_l41(ptr [[TMP0]]) #[[ATTR2]] +// CK-64-NEXT: br label %[[OMP_OFFLOAD_CONT]] +// CK-64: [[OMP_OFFLOAD_CONT]]: +// CK-64-NEXT: ret void +// +// +// CK-64-LABEL: define internal void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo3v_l41( +// CK-64-SAME: ptr [[PA:%.*]]) #[[ATTR1]] { +// CK-64-NEXT: [[ENTRY:.*:]] +// CK-64-NEXT: [[PA_ADDR:%.*]] = alloca ptr, align 8 +// CK-64-NEXT: [[PA1:%.*]] = alloca ptr, align 8 +// CK-64-NEXT: store ptr [[PA]], ptr [[PA_ADDR]], align 8 +// CK-64-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PA1]], align 8 +// CK-64-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 50 +// CK-64-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +// CK-64-NEXT: [[INC:%.*]] = add nsw i32 [[TMP1]], 1 +// CK-64-NEXT: store i32 [[INC]], ptr [[ARRAYIDX]], align 4 +// CK-64-NEXT: ret void +// +// +// CK-64-LABEL: define dso_local void @_Z4foo4v( +// CK-64-SAME: ) #[[ATTR0]] { +// CK-64-NEXT: [[ENTRY:.*:]] +// CK-64-NEXT: [[P:%.*]] = alloca i32, align 4 +// CK-64-NEXT: [[P_CASTED:%.*]] = alloca i64, align 8 +// CK-64-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [1 x ptr], align 8 +// CK-64-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [1 x ptr], align 8 +// CK-64-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [1 x ptr], align 8 +// CK-64-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8 +// CK-64-NEXT: [[TMP0:%.*]] = load i32, ptr [[P]], align 4 +// CK-64-NEXT: store i32 [[TMP0]], ptr [[P_CASTED]], align 4 +// CK-64-NEXT: [[TMP1:%.*]] = load i64, ptr [[P_CASTED]], align 8 +// CK-64-NEXT: [[TMP2:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CK-64-NEXT: store i64 [[TMP1]], ptr [[TMP2]], align 8 +// CK-64-NEXT: [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CK-64-NEXT: store i64 [[TMP1]], ptr [[TMP3]], align 8 +// CK-64-NEXT: [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i64 0, i64 0 +// CK-64-NEXT: store ptr null, ptr [[TMP4]], align 8 +// CK-64-NEXT: [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CK-64-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CK-64-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0 +// CK-64-NEXT: store i32 3, ptr [[TMP7]], align 4 +// CK-64-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1 +// CK-64-NEXT: store i32 1, ptr [[TMP8]], align 4 +// CK-64-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2 +// CK-64-NEXT: store ptr [[TMP5]], ptr [[TMP9]], align 8 +// CK-64-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3 +// CK-64-NEXT: store ptr [[TMP6]], ptr [[TMP10]], align 8 +// CK-64-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr 
[[KERNEL_ARGS]], i32 0, i32 4 +// CK-64-NEXT: store ptr @.offload_sizes.5, ptr [[TMP11]], align 8 +// CK-64-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5 +// CK-64-NEXT: store ptr @.offload_maptypes.6, ptr [[TMP12]], align 8 +// CK-64-NEXT: [[TMP13:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6 +// CK-64-NEXT: store ptr null, ptr [[TMP13]], align 8 +// CK-64-NEXT: [[TMP14:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7 +// CK-64-NEXT: store ptr null, ptr [[TMP14]], align 8 +// CK-64-NEXT: [[TMP15:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8 +// CK-64-NEXT: store i64 0, ptr [[TMP15]], align 8 +// CK-64-NEXT: [[TMP16:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9 +// CK-64-NEXT: store i64 0, ptr [[TMP16]], align 8 +// CK-64-NEXT: [[TMP17:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10 +// CK-64-NEXT: store [3 x i32] [i32 -1, i32 0, i32 0], ptr [[TMP17]], align 4 +// CK-64-NEXT: [[TMP18:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11 +// CK-64-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP18]], align 4 +// CK-64-NEXT: [[TMP19:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12 +// CK-64-NEXT: store i32 0, ptr [[TMP19]], align 4 +// CK-64-NEXT: [[TMP20:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 -1, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo4v_l52.region_id, ptr [[KERNEL_ARGS]]) +// CK-64-NEXT: [[TMP21:%.*]] = icmp ne i32 [[TMP20]], 0 +// CK-64-NEXT: br i1 [[TMP21]], label %[[OMP_OFFLOAD_FAILED:.*]], label %[[OMP_OFFLOAD_CONT:.*]] +// CK-64: [[OMP_OFFLOAD_FAILED]]: +// CK-64-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo4v_l52(i64 [[TMP1]]) #[[ATTR2]] +// CK-64-NEXT: br label %[[OMP_OFFLOAD_CONT]] +// CK-64: [[OMP_OFFLOAD_CONT]]: +// CK-64-NEXT: ret void +// +// +// CK-64-LABEL: define internal void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo4v_l52( +// CK-64-SAME: i64 [[P:%.*]]) #[[ATTR1]] { +// CK-64-NEXT: [[ENTRY:.*:]] +// CK-64-NEXT: [[P_ADDR:%.*]] = alloca i64, align 8 +// CK-64-NEXT: store i64 [[P]], ptr [[P_ADDR]], align 8 +// CK-64-NEXT: [[TMP0:%.*]] = load i32, ptr [[P_ADDR]], align 4 +// CK-64-NEXT: [[INC:%.*]] = add nsw i32 [[TMP0]], 1 +// CK-64-NEXT: store i32 [[INC]], ptr [[P_ADDR]], align 4 +// CK-64-NEXT: ret void +// +// +// CK-64-LABEL: define dso_local void @_Z4foo5i( +// CK-64-SAME: i32 signext [[A:%.*]]) #[[ATTR0]] { +// CK-64-NEXT: [[ENTRY:.*:]] +// CK-64-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 +// CK-64-NEXT: [[D:%.*]] = alloca double, align 8 +// CK-64-NEXT: [[PVTARR:%.*]] = alloca [10 x i32], align 4 +// CK-64-NEXT: [[PA:%.*]] = alloca ptr, align 8 +// CK-64-NEXT: [[D_CASTED:%.*]] = alloca i64, align 8 +// CK-64-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [3 x ptr], align 8 +// CK-64-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [3 x ptr], align 8 +// CK-64-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [3 x ptr], align 8 +// CK-64-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8 +// CK-64-NEXT: store i32 [[A]], ptr [[A_ADDR]], align 4 +// CK-64-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4 +// CK-64-NEXT: 
[[CONV:%.*]] = sitofp i32 [[TMP0]] to double +// CK-64-NEXT: store double [[CONV]], ptr [[D]], align 8 +// CK-64-NEXT: [[TMP1:%.*]] = load double, ptr [[D]], align 8 +// CK-64-NEXT: store double [[TMP1]], ptr [[D_CASTED]], align 8 +// CK-64-NEXT: [[TMP2:%.*]] = load i64, ptr [[D_CASTED]], align 8 +// CK-64-NEXT: [[TMP3:%.*]] = load ptr, ptr [[PA]], align 8 +// CK-64-NEXT: [[TMP4:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CK-64-NEXT: store i64 [[TMP2]], ptr [[TMP4]], align 8 +// CK-64-NEXT: [[TMP5:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CK-64-NEXT: store i64 [[TMP2]], ptr [[TMP5]], align 8 +// CK-64-NEXT: [[TMP6:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i64 0, i64 0 +// CK-64-NEXT: store ptr null, ptr [[TMP6]], align 8 +// CK-64-NEXT: [[TMP7:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 1 +// CK-64-NEXT: store ptr [[PVTARR]], ptr [[TMP7]], align 8 +// CK-64-NEXT: [[TMP8:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 1 +// CK-64-NEXT: store ptr [[PVTARR]], ptr [[TMP8]], align 8 +// CK-64-NEXT: [[TMP9:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i64 0, i64 1 +// CK-64-NEXT: store ptr null, ptr [[TMP9]], align 8 +// CK-64-NEXT: [[TMP10:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 2 +// CK-64-NEXT: store ptr [[TMP3]], ptr [[TMP10]], align 8 +// CK-64-NEXT: [[TMP11:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 2 +// CK-64-NEXT: store ptr [[TMP3]], ptr [[TMP11]], align 8 +// CK-64-NEXT: [[TMP12:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i64 0, i64 2 +// CK-64-NEXT: store ptr null, ptr [[TMP12]], align 8 +// CK-64-NEXT: [[TMP13:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CK-64-NEXT: [[TMP14:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CK-64-NEXT: [[TMP15:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0 +// CK-64-NEXT: store i32 3, ptr [[TMP15]], align 4 +// CK-64-NEXT: [[TMP16:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1 +// CK-64-NEXT: store i32 3, ptr [[TMP16]], align 4 +// CK-64-NEXT: [[TMP17:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2 +// CK-64-NEXT: store ptr [[TMP13]], ptr [[TMP17]], align 8 +// CK-64-NEXT: [[TMP18:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3 +// CK-64-NEXT: store ptr [[TMP14]], ptr [[TMP18]], align 8 +// CK-64-NEXT: [[TMP19:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4 +// CK-64-NEXT: store ptr @.offload_sizes.7, ptr [[TMP19]], align 8 +// CK-64-NEXT: [[TMP20:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5 +// CK-64-NEXT: store ptr @.offload_maptypes.8, ptr [[TMP20]], align 8 +// CK-64-NEXT: [[TMP21:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6 +// CK-64-NEXT: store ptr null, ptr [[TMP21]], align 8 +// CK-64-NEXT: [[TMP22:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7 +// CK-64-NEXT: store ptr null, ptr [[TMP22]], align 8 +// CK-64-NEXT: 
[[TMP23:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8 +// CK-64-NEXT: store i64 0, ptr [[TMP23]], align 8 +// CK-64-NEXT: [[TMP24:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9 +// CK-64-NEXT: store i64 0, ptr [[TMP24]], align 8 +// CK-64-NEXT: [[TMP25:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10 +// CK-64-NEXT: store [3 x i32] [i32 -1, i32 0, i32 0], ptr [[TMP25]], align 4 +// CK-64-NEXT: [[TMP26:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11 +// CK-64-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP26]], align 4 +// CK-64-NEXT: [[TMP27:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12 +// CK-64-NEXT: store i32 0, ptr [[TMP27]], align 4 +// CK-64-NEXT: [[TMP28:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 -1, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo5i_l66.region_id, ptr [[KERNEL_ARGS]]) +// CK-64-NEXT: [[TMP29:%.*]] = icmp ne i32 [[TMP28]], 0 +// CK-64-NEXT: br i1 [[TMP29]], label %[[OMP_OFFLOAD_FAILED:.*]], label %[[OMP_OFFLOAD_CONT:.*]] +// CK-64: [[OMP_OFFLOAD_FAILED]]: +// CK-64-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo5i_l66(i64 [[TMP2]], ptr [[PVTARR]], ptr [[TMP3]]) #[[ATTR2]] +// CK-64-NEXT: br label %[[OMP_OFFLOAD_CONT]] +// CK-64: [[OMP_OFFLOAD_CONT]]: +// CK-64-NEXT: ret void +// +// +// CK-64-LABEL: define internal void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo5i_l66( +// CK-64-SAME: i64 [[D:%.*]], ptr nonnull align 4 dereferenceable(40) [[PVTARR:%.*]], ptr [[PA:%.*]]) #[[ATTR1]] { +// CK-64-NEXT: [[ENTRY:.*:]] +// CK-64-NEXT: [[D_ADDR:%.*]] = alloca i64, align 8 +// CK-64-NEXT: [[PVTARR_ADDR:%.*]] = alloca ptr, align 8 +// CK-64-NEXT: [[PA_ADDR:%.*]] = alloca ptr, align 8 +// CK-64-NEXT: [[D1:%.*]] = alloca double, align 8 +// CK-64-NEXT: [[PVTARR2:%.*]] = alloca [10 x i32], align 4 +// CK-64-NEXT: [[PA3:%.*]] = alloca ptr, align 8 +// CK-64-NEXT: store i64 [[D]], ptr [[D_ADDR]], align 8 +// CK-64-NEXT: store ptr [[PVTARR]], ptr [[PVTARR_ADDR]], align 8 +// CK-64-NEXT: store ptr [[PA]], ptr [[PA_ADDR]], align 8 +// CK-64-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PVTARR_ADDR]], align 8, !nonnull [[META19]], !align [[META20]] +// CK-64-NEXT: [[TMP1:%.*]] = load double, ptr [[D1]], align 8 +// CK-64-NEXT: [[ADD:%.*]] = fadd double [[TMP1]], 1.000000e+00 +// CK-64-NEXT: store double [[ADD]], ptr [[D1]], align 8 +// CK-64-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], ptr [[PVTARR2]], i64 0, i64 5 +// CK-64-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +// CK-64-NEXT: [[INC:%.*]] = add nsw i32 [[TMP2]], 1 +// CK-64-NEXT: store i32 [[INC]], ptr [[ARRAYIDX]], align 4 +// CK-64-NEXT: [[TMP3:%.*]] = load ptr, ptr [[PA3]], align 8 +// CK-64-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i64 50 +// CK-64-NEXT: [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX4]], align 4 +// CK-64-NEXT: [[INC5:%.*]] = add nsw i32 [[TMP4]], 1 +// CK-64-NEXT: store i32 [[INC5]], ptr [[ARRAYIDX4]], align 4 +// CK-64-NEXT: ret void +// +// +// CK-64-LABEL: define dso_local void @_Z4foo6i( +// CK-64-SAME: i32 signext [[A:%.*]]) #[[ATTR0]] { +// CK-64-NEXT: [[ENTRY:.*:]] +// CK-64-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 +// CK-64-NEXT: [[D:%.*]] = alloca double, align 8 +// CK-64-NEXT: [[PVTARR:%.*]] = alloca [10 x i32], 
align 4 +// CK-64-NEXT: [[PA:%.*]] = alloca ptr, align 8 +// CK-64-NEXT: [[D_CASTED:%.*]] = alloca i64, align 8 +// CK-64-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [3 x ptr], align 8 +// CK-64-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [3 x ptr], align 8 +// CK-64-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [3 x ptr], align 8 +// CK-64-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8 +// CK-64-NEXT: store i32 [[A]], ptr [[A_ADDR]], align 4 +// CK-64-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4 +// CK-64-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP0]] to double +// CK-64-NEXT: store double [[CONV]], ptr [[D]], align 8 +// CK-64-NEXT: [[TMP1:%.*]] = load double, ptr [[D]], align 8 +// CK-64-NEXT: store double [[TMP1]], ptr [[D_CASTED]], align 8 +// CK-64-NEXT: [[TMP2:%.*]] = load i64, ptr [[D_CASTED]], align 8 +// CK-64-NEXT: [[TMP3:%.*]] = load ptr, ptr [[PA]], align 8 +// CK-64-NEXT: [[TMP4:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CK-64-NEXT: store i64 [[TMP2]], ptr [[TMP4]], align 8 +// CK-64-NEXT: [[TMP5:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CK-64-NEXT: store i64 [[TMP2]], ptr [[TMP5]], align 8 +// CK-64-NEXT: [[TMP6:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i64 0, i64 0 +// CK-64-NEXT: store ptr null, ptr [[TMP6]], align 8 +// CK-64-NEXT: [[TMP7:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 1 +// CK-64-NEXT: store ptr [[PVTARR]], ptr [[TMP7]], align 8 +// CK-64-NEXT: [[TMP8:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 1 +// CK-64-NEXT: store ptr [[PVTARR]], ptr [[TMP8]], align 8 +// CK-64-NEXT: [[TMP9:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i64 0, i64 1 +// CK-64-NEXT: store ptr null, ptr [[TMP9]], align 8 +// CK-64-NEXT: [[TMP10:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 2 +// CK-64-NEXT: store ptr [[TMP3]], ptr [[TMP10]], align 8 +// CK-64-NEXT: [[TMP11:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 2 +// CK-64-NEXT: store ptr [[TMP3]], ptr [[TMP11]], align 8 +// CK-64-NEXT: [[TMP12:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i64 0, i64 2 +// CK-64-NEXT: store ptr null, ptr [[TMP12]], align 8 +// CK-64-NEXT: [[TMP13:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CK-64-NEXT: [[TMP14:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CK-64-NEXT: [[TMP15:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0 +// CK-64-NEXT: store i32 3, ptr [[TMP15]], align 4 +// CK-64-NEXT: [[TMP16:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1 +// CK-64-NEXT: store i32 3, ptr [[TMP16]], align 4 +// CK-64-NEXT: [[TMP17:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2 +// CK-64-NEXT: store ptr [[TMP13]], ptr [[TMP17]], align 8 +// CK-64-NEXT: [[TMP18:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3 +// CK-64-NEXT: store ptr [[TMP14]], ptr [[TMP18]], align 8 +// CK-64-NEXT: [[TMP19:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4 +// CK-64-NEXT: store ptr @.offload_sizes.9, ptr [[TMP19]], align 8 +// CK-64-NEXT: [[TMP20:%.*]] 
= getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5 +// CK-64-NEXT: store ptr @.offload_maptypes.10, ptr [[TMP20]], align 8 +// CK-64-NEXT: [[TMP21:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6 +// CK-64-NEXT: store ptr null, ptr [[TMP21]], align 8 +// CK-64-NEXT: [[TMP22:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7 +// CK-64-NEXT: store ptr null, ptr [[TMP22]], align 8 +// CK-64-NEXT: [[TMP23:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8 +// CK-64-NEXT: store i64 0, ptr [[TMP23]], align 8 +// CK-64-NEXT: [[TMP24:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9 +// CK-64-NEXT: store i64 0, ptr [[TMP24]], align 8 +// CK-64-NEXT: [[TMP25:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10 +// CK-64-NEXT: store [3 x i32] [i32 -1, i32 0, i32 0], ptr [[TMP25]], align 4 +// CK-64-NEXT: [[TMP26:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11 +// CK-64-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP26]], align 4 +// CK-64-NEXT: [[TMP27:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12 +// CK-64-NEXT: store i32 0, ptr [[TMP27]], align 4 +// CK-64-NEXT: [[TMP28:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 -1, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo6i_l82.region_id, ptr [[KERNEL_ARGS]]) +// CK-64-NEXT: [[TMP29:%.*]] = icmp ne i32 [[TMP28]], 0 +// CK-64-NEXT: br i1 [[TMP29]], label %[[OMP_OFFLOAD_FAILED:.*]], label %[[OMP_OFFLOAD_CONT:.*]] +// CK-64: [[OMP_OFFLOAD_FAILED]]: +// CK-64-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo6i_l82(i64 [[TMP2]], ptr [[PVTARR]], ptr [[TMP3]]) #[[ATTR2]] +// CK-64-NEXT: br label %[[OMP_OFFLOAD_CONT]] +// CK-64: [[OMP_OFFLOAD_CONT]]: +// CK-64-NEXT: ret void +// +// +// CK-64-LABEL: define internal void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo6i_l82( +// CK-64-SAME: i64 [[D:%.*]], ptr nonnull align 4 dereferenceable(40) [[PVTARR:%.*]], ptr [[PA:%.*]]) #[[ATTR1]] { +// CK-64-NEXT: [[ENTRY:.*:]] +// CK-64-NEXT: [[D_ADDR:%.*]] = alloca i64, align 8 +// CK-64-NEXT: [[PVTARR_ADDR:%.*]] = alloca ptr, align 8 +// CK-64-NEXT: [[PA_ADDR:%.*]] = alloca ptr, align 8 +// CK-64-NEXT: store i64 [[D]], ptr [[D_ADDR]], align 8 +// CK-64-NEXT: store ptr [[PVTARR]], ptr [[PVTARR_ADDR]], align 8 +// CK-64-NEXT: store ptr [[PA]], ptr [[PA_ADDR]], align 8 +// CK-64-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PVTARR_ADDR]], align 8, !nonnull [[META19]], !align [[META20]] +// CK-64-NEXT: [[TMP1:%.*]] = load double, ptr [[D_ADDR]], align 8 +// CK-64-NEXT: [[ADD:%.*]] = fadd double [[TMP1]], 1.000000e+00 +// CK-64-NEXT: store double [[ADD]], ptr [[D_ADDR]], align 8 +// CK-64-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], ptr [[TMP0]], i64 0, i64 5 +// CK-64-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +// CK-64-NEXT: [[INC:%.*]] = add nsw i32 [[TMP2]], 1 +// CK-64-NEXT: store i32 [[INC]], ptr [[ARRAYIDX]], align 4 +// CK-64-NEXT: [[TMP3:%.*]] = load ptr, ptr [[PA_ADDR]], align 8 +// CK-64-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i64 50 +// CK-64-NEXT: [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX1]], align 4 +// CK-64-NEXT: [[INC2:%.*]] = add nsw i32 
[[TMP4]], 1 +// CK-64-NEXT: store i32 [[INC2]], ptr [[ARRAYIDX1]], align 4 +// CK-64-NEXT: ret void +// +// +// CK-64-LABEL: define dso_local void @_Z4foo7i( +// CK-64-SAME: i32 signext [[A:%.*]]) #[[ATTR0]] { +// CK-64-NEXT: [[ENTRY:.*:]] +// CK-64-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 +// CK-64-NEXT: [[D:%.*]] = alloca double, align 8 +// CK-64-NEXT: [[PVTARR:%.*]] = alloca [10 x i32], align 4 +// CK-64-NEXT: [[PA:%.*]] = alloca ptr, align 8 +// CK-64-NEXT: [[D_CASTED:%.*]] = alloca i64, align 8 +// CK-64-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [3 x ptr], align 8 +// CK-64-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [3 x ptr], align 8 +// CK-64-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [3 x ptr], align 8 +// CK-64-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8 +// CK-64-NEXT: store i32 [[A]], ptr [[A_ADDR]], align 4 +// CK-64-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4 +// CK-64-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP0]] to double +// CK-64-NEXT: store double [[CONV]], ptr [[D]], align 8 +// CK-64-NEXT: [[TMP1:%.*]] = load double, ptr [[D]], align 8 +// CK-64-NEXT: store double [[TMP1]], ptr [[D_CASTED]], align 8 +// CK-64-NEXT: [[TMP2:%.*]] = load i64, ptr [[D_CASTED]], align 8 +// CK-64-NEXT: [[TMP3:%.*]] = load ptr, ptr [[PA]], align 8 +// CK-64-NEXT: [[TMP4:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CK-64-NEXT: store i64 [[TMP2]], ptr [[TMP4]], align 8 +// CK-64-NEXT: [[TMP5:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CK-64-NEXT: store i64 [[TMP2]], ptr [[TMP5]], align 8 +// CK-64-NEXT: [[TMP6:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i64 0, i64 0 +// CK-64-NEXT: store ptr null, ptr [[TMP6]], align 8 +// CK-64-NEXT: [[TMP7:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 1 +// CK-64-NEXT: store ptr [[PVTARR]], ptr [[TMP7]], align 8 +// CK-64-NEXT: [[TMP8:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 1 +// CK-64-NEXT: store ptr [[PVTARR]], ptr [[TMP8]], align 8 +// CK-64-NEXT: [[TMP9:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i64 0, i64 1 +// CK-64-NEXT: store ptr null, ptr [[TMP9]], align 8 +// CK-64-NEXT: [[TMP10:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 2 +// CK-64-NEXT: store ptr [[TMP3]], ptr [[TMP10]], align 8 +// CK-64-NEXT: [[TMP11:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 2 +// CK-64-NEXT: store ptr [[TMP3]], ptr [[TMP11]], align 8 +// CK-64-NEXT: [[TMP12:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i64 0, i64 2 +// CK-64-NEXT: store ptr null, ptr [[TMP12]], align 8 +// CK-64-NEXT: [[TMP13:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CK-64-NEXT: [[TMP14:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CK-64-NEXT: [[TMP15:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0 +// CK-64-NEXT: store i32 3, ptr [[TMP15]], align 4 +// CK-64-NEXT: [[TMP16:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1 +// CK-64-NEXT: store i32 3, ptr [[TMP16]], align 4 +// CK-64-NEXT: [[TMP17:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2 +// CK-64-NEXT: store ptr [[TMP13]], ptr [[TMP17]], align 8 +// CK-64-NEXT: 
[[TMP18:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3 +// CK-64-NEXT: store ptr [[TMP14]], ptr [[TMP18]], align 8 +// CK-64-NEXT: [[TMP19:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4 +// CK-64-NEXT: store ptr @.offload_sizes.11, ptr [[TMP19]], align 8 +// CK-64-NEXT: [[TMP20:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5 +// CK-64-NEXT: store ptr @.offload_maptypes.12, ptr [[TMP20]], align 8 +// CK-64-NEXT: [[TMP21:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6 +// CK-64-NEXT: store ptr null, ptr [[TMP21]], align 8 +// CK-64-NEXT: [[TMP22:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7 +// CK-64-NEXT: store ptr null, ptr [[TMP22]], align 8 +// CK-64-NEXT: [[TMP23:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8 +// CK-64-NEXT: store i64 0, ptr [[TMP23]], align 8 +// CK-64-NEXT: [[TMP24:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9 +// CK-64-NEXT: store i64 0, ptr [[TMP24]], align 8 +// CK-64-NEXT: [[TMP25:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10 +// CK-64-NEXT: store [3 x i32] [i32 -1, i32 0, i32 0], ptr [[TMP25]], align 4 +// CK-64-NEXT: [[TMP26:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11 +// CK-64-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP26]], align 4 +// CK-64-NEXT: [[TMP27:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12 +// CK-64-NEXT: store i32 0, ptr [[TMP27]], align 4 +// CK-64-NEXT: [[TMP28:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 -1, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo7i_l98.region_id, ptr [[KERNEL_ARGS]]) +// CK-64-NEXT: [[TMP29:%.*]] = icmp ne i32 [[TMP28]], 0 +// CK-64-NEXT: br i1 [[TMP29]], label %[[OMP_OFFLOAD_FAILED:.*]], label %[[OMP_OFFLOAD_CONT:.*]] +// CK-64: [[OMP_OFFLOAD_FAILED]]: +// CK-64-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo7i_l98(i64 [[TMP2]], ptr [[PVTARR]], ptr [[TMP3]]) #[[ATTR2]] +// CK-64-NEXT: br label %[[OMP_OFFLOAD_CONT]] +// CK-64: [[OMP_OFFLOAD_CONT]]: +// CK-64-NEXT: ret void +// +// +// CK-64-LABEL: define internal void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo7i_l98( +// CK-64-SAME: i64 [[D:%.*]], ptr nonnull align 4 dereferenceable(40) [[PVTARR:%.*]], ptr [[PA:%.*]]) #[[ATTR1]] { +// CK-64-NEXT: [[ENTRY:.*:]] +// CK-64-NEXT: [[D_ADDR:%.*]] = alloca i64, align 8 +// CK-64-NEXT: [[PVTARR_ADDR:%.*]] = alloca ptr, align 8 +// CK-64-NEXT: [[PA_ADDR:%.*]] = alloca ptr, align 8 +// CK-64-NEXT: [[PVTARR1:%.*]] = alloca [10 x i32], align 4 +// CK-64-NEXT: store i64 [[D]], ptr [[D_ADDR]], align 8 +// CK-64-NEXT: store ptr [[PVTARR]], ptr [[PVTARR_ADDR]], align 8 +// CK-64-NEXT: store ptr [[PA]], ptr [[PA_ADDR]], align 8 +// CK-64-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PVTARR_ADDR]], align 8, !nonnull [[META19]], !align [[META20]] +// CK-64-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[PVTARR1]], ptr align 4 [[TMP0]], i64 40, i1 false) +// CK-64-NEXT: [[TMP1:%.*]] = load double, ptr [[D_ADDR]], align 8 +// CK-64-NEXT: [[ADD:%.*]] = fadd double [[TMP1]], 1.000000e+00 +// CK-64-NEXT: store double 
[[ADD]], ptr [[D_ADDR]], align 8 +// CK-64-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], ptr [[PVTARR1]], i64 0, i64 5 +// CK-64-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +// CK-64-NEXT: [[INC:%.*]] = add nsw i32 [[TMP2]], 1 +// CK-64-NEXT: store i32 [[INC]], ptr [[ARRAYIDX]], align 4 +// CK-64-NEXT: [[TMP3:%.*]] = load ptr, ptr [[PA_ADDR]], align 8 +// CK-64-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i64 50 +// CK-64-NEXT: [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4 +// CK-64-NEXT: [[INC3:%.*]] = add nsw i32 [[TMP4]], 1 +// CK-64-NEXT: store i32 [[INC3]], ptr [[ARRAYIDX2]], align 4 +// CK-64-NEXT: ret void +// +// +// CK-64-LABEL: define dso_local void @_Z4foo8v( +// CK-64-SAME: ) #[[ATTR0]] { +// CK-64-NEXT: [[ENTRY:.*:]] +// CK-64-NEXT: [[X:%.*]] = alloca i32, align 4 +// CK-64-NEXT: [[X_CASTED:%.*]] = alloca i64, align 8 +// CK-64-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [1 x ptr], align 8 +// CK-64-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [1 x ptr], align 8 +// CK-64-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [1 x ptr], align 8 +// CK-64-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CK-64-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8 +// CK-64-NEXT: store i32 0, ptr [[X]], align 4 +// CK-64-NEXT: [[TMP0:%.*]] = load i32, ptr [[X]], align 4 +// CK-64-NEXT: store i32 [[TMP0]], ptr [[X_CASTED]], align 4 +// CK-64-NEXT: [[TMP1:%.*]] = load i64, ptr [[X_CASTED]], align 8 +// CK-64-NEXT: [[TMP2:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CK-64-NEXT: store i64 [[TMP1]], ptr [[TMP2]], align 8 +// CK-64-NEXT: [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CK-64-NEXT: store i64 [[TMP1]], ptr [[TMP3]], align 8 +// CK-64-NEXT: [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i64 0, i64 0 +// CK-64-NEXT: store ptr null, ptr [[TMP4]], align 8 +// CK-64-NEXT: [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CK-64-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CK-64-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0 +// CK-64-NEXT: store i32 3, ptr [[TMP7]], align 4 +// CK-64-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1 +// CK-64-NEXT: store i32 1, ptr [[TMP8]], align 4 +// CK-64-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2 +// CK-64-NEXT: store ptr [[TMP5]], ptr [[TMP9]], align 8 +// CK-64-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3 +// CK-64-NEXT: store ptr [[TMP6]], ptr [[TMP10]], align 8 +// CK-64-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4 +// CK-64-NEXT: store ptr @.offload_sizes.13, ptr [[TMP11]], align 8 +// CK-64-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5 +// CK-64-NEXT: store ptr @.offload_maptypes.14, ptr [[TMP12]], align 8 +// CK-64-NEXT: [[TMP13:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6 +// CK-64-NEXT: store ptr null, ptr [[TMP13]], align 8 +// CK-64-NEXT: [[TMP14:%.*]] = getelementptr inbounds nuw 
[[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7 +// CK-64-NEXT: store ptr null, ptr [[TMP14]], align 8 +// CK-64-NEXT: [[TMP15:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8 +// CK-64-NEXT: store i64 10, ptr [[TMP15]], align 8 +// CK-64-NEXT: [[TMP16:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9 +// CK-64-NEXT: store i64 0, ptr [[TMP16]], align 8 +// CK-64-NEXT: [[TMP17:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10 +// CK-64-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP17]], align 4 +// CK-64-NEXT: [[TMP18:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11 +// CK-64-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP18]], align 4 +// CK-64-NEXT: [[TMP19:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12 +// CK-64-NEXT: store i32 0, ptr [[TMP19]], align 4 +// CK-64-NEXT: [[TMP20:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 0, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo8v_l112.region_id, ptr [[KERNEL_ARGS]]) +// CK-64-NEXT: [[TMP21:%.*]] = icmp ne i32 [[TMP20]], 0 +// CK-64-NEXT: br i1 [[TMP21]], label %[[OMP_OFFLOAD_FAILED:.*]], label %[[OMP_OFFLOAD_CONT:.*]] +// CK-64: [[OMP_OFFLOAD_FAILED]]: +// CK-64-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo8v_l112(i64 [[TMP1]]) #[[ATTR2]] +// CK-64-NEXT: br label %[[OMP_OFFLOAD_CONT]] +// CK-64: [[OMP_OFFLOAD_CONT]]: +// CK-64-NEXT: ret void +// +// +// CK-64-LABEL: define internal void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo8v_l112( +// CK-64-SAME: i64 [[X:%.*]]) #[[ATTR1]] { +// CK-64-NEXT: [[ENTRY:.*:]] +// CK-64-NEXT: [[X_ADDR:%.*]] = alloca i64, align 8 +// CK-64-NEXT: [[X_CASTED:%.*]] = alloca i64, align 8 +// CK-64-NEXT: store i64 [[X]], ptr [[X_ADDR]], align 8 +// CK-64-NEXT: [[TMP0:%.*]] = load i32, ptr [[X_ADDR]], align 4 +// CK-64-NEXT: store i32 [[TMP0]], ptr [[X_CASTED]], align 4 +// CK-64-NEXT: [[TMP1:%.*]] = load i64, ptr [[X_CASTED]], align 8 +// CK-64-NEXT: call void (ptr, i32, ptr, ...) 
@__kmpc_fork_teams(ptr @[[GLOB1]], i32 1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo8v_l112.omp_outlined, i64 [[TMP1]]) +// CK-64-NEXT: ret void +// +// +// CK-64-LABEL: define internal void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo8v_l112.omp_outlined( +// CK-64-SAME: ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], i64 [[X:%.*]]) #[[ATTR1]] { +// CK-64-NEXT: [[ENTRY:.*:]] +// CK-64-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 +// CK-64-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 +// CK-64-NEXT: [[X_ADDR:%.*]] = alloca i64, align 8 +// CK-64-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CK-64-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CK-64-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CK-64-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CK-64-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CK-64-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CK-64-NEXT: [[I:%.*]] = alloca i32, align 4 +// CK-64-NEXT: [[X_CASTED:%.*]] = alloca i64, align 8 +// CK-64-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8 +// CK-64-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8 +// CK-64-NEXT: store i64 [[X]], ptr [[X_ADDR]], align 8 +// CK-64-NEXT: store i32 0, ptr [[DOTOMP_COMB_LB]], align 4 +// CK-64-NEXT: store i32 9, ptr [[DOTOMP_COMB_UB]], align 4 +// CK-64-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4 +// CK-64-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4 +// CK-64-NEXT: [[TMP0:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8 +// CK-64-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4 +// CK-64-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB2:[0-9]+]], i32 [[TMP1]], i32 92, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_COMB_LB]], ptr [[DOTOMP_COMB_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1) +// CK-64-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4 +// CK-64-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9 +// CK-64-NEXT: br i1 [[CMP]], label %[[COND_TRUE:.*]], label %[[COND_FALSE:.*]] +// CK-64: [[COND_TRUE]]: +// CK-64-NEXT: br label %[[COND_END:.*]] +// CK-64: [[COND_FALSE]]: +// CK-64-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4 +// CK-64-NEXT: br label %[[COND_END]] +// CK-64: [[COND_END]]: +// CK-64-NEXT: [[COND:%.*]] = phi i32 [ 9, %[[COND_TRUE]] ], [ [[TMP3]], %[[COND_FALSE]] ] +// CK-64-NEXT: store i32 [[COND]], ptr [[DOTOMP_COMB_UB]], align 4 +// CK-64-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4 +// CK-64-NEXT: store i32 [[TMP4]], ptr [[DOTOMP_IV]], align 4 +// CK-64-NEXT: br label %[[OMP_INNER_FOR_COND:.*]] +// CK-64: [[OMP_INNER_FOR_COND]]: +// CK-64-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4 +// CK-64-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4 +// CK-64-NEXT: [[CMP1:%.*]] = icmp sle i32 [[TMP5]], [[TMP6]] +// CK-64-NEXT: br i1 [[CMP1]], label %[[OMP_INNER_FOR_BODY:.*]], label %[[OMP_INNER_FOR_END:.*]] +// CK-64: [[OMP_INNER_FOR_BODY]]: +// CK-64-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4 +// CK-64-NEXT: [[TMP8:%.*]] = zext i32 [[TMP7]] to i64 +// CK-64-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4 +// CK-64-NEXT: [[TMP10:%.*]] = zext i32 [[TMP9]] to i64 +// CK-64-NEXT: [[TMP11:%.*]] = load i32, ptr [[X_ADDR]], align 4 +// CK-64-NEXT: store i32 [[TMP11]], ptr [[X_CASTED]], align 4 +// CK-64-NEXT: [[TMP12:%.*]] = load i64, ptr [[X_CASTED]], align 8 +// CK-64-NEXT: call void (ptr, i32, ptr, ...) 
@__kmpc_fork_call(ptr @[[GLOB1]], i32 3, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo8v_l112.omp_outlined.omp_outlined, i64 [[TMP8]], i64 [[TMP10]], i64 [[TMP12]]) +// CK-64-NEXT: br label %[[OMP_INNER_FOR_INC:.*]] +// CK-64: [[OMP_INNER_FOR_INC]]: +// CK-64-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4 +// CK-64-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4 +// CK-64-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], [[TMP14]] +// CK-64-NEXT: store i32 [[ADD]], ptr [[DOTOMP_IV]], align 4 +// CK-64-NEXT: br label %[[OMP_INNER_FOR_COND]] +// CK-64: [[OMP_INNER_FOR_END]]: +// CK-64-NEXT: br label %[[OMP_LOOP_EXIT:.*]] +// CK-64: [[OMP_LOOP_EXIT]]: +// CK-64-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB2]], i32 [[TMP1]]) +// CK-64-NEXT: ret void +// +// +// CK-64-LABEL: define internal void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo8v_l112.omp_outlined.omp_outlined( +// CK-64-SAME: ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], i64 [[X:%.*]]) #[[ATTR1]] { +// CK-64-NEXT: [[ENTRY:.*:]] +// CK-64-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 +// CK-64-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 +// CK-64-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8 +// CK-64-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8 +// CK-64-NEXT: [[X_ADDR:%.*]] = alloca i64, align 8 +// CK-64-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CK-64-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CK-64-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CK-64-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CK-64-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CK-64-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CK-64-NEXT: [[I:%.*]] = alloca i32, align 4 +// CK-64-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8 +// CK-64-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8 +// CK-64-NEXT: store i64 [[DOTPREVIOUS_LB_]], ptr [[DOTPREVIOUS_LB__ADDR]], align 8 +// CK-64-NEXT: store i64 [[DOTPREVIOUS_UB_]], ptr [[DOTPREVIOUS_UB__ADDR]], align 8 +// CK-64-NEXT: store i64 [[X]], ptr [[X_ADDR]], align 8 +// CK-64-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4 +// CK-64-NEXT: store i32 9, ptr [[DOTOMP_UB]], align 4 +// CK-64-NEXT: [[TMP0:%.*]] = load i64, ptr [[DOTPREVIOUS_LB__ADDR]], align 8 +// CK-64-NEXT: [[CONV:%.*]] = trunc i64 [[TMP0]] to i32 +// CK-64-NEXT: [[TMP1:%.*]] = load i64, ptr [[DOTPREVIOUS_UB__ADDR]], align 8 +// CK-64-NEXT: [[CONV1:%.*]] = trunc i64 [[TMP1]] to i32 +// CK-64-NEXT: store i32 [[CONV]], ptr [[DOTOMP_LB]], align 4 +// CK-64-NEXT: store i32 [[CONV1]], ptr [[DOTOMP_UB]], align 4 +// CK-64-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4 +// CK-64-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4 +// CK-64-NEXT: [[TMP2:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8 +// CK-64-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4 +// CK-64-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB3:[0-9]+]], i32 [[TMP3]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1) +// CK-64-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4 +// CK-64-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP4]], 9 +// CK-64-NEXT: br i1 [[CMP]], label %[[COND_TRUE:.*]], label %[[COND_FALSE:.*]] +// CK-64: [[COND_TRUE]]: +// CK-64-NEXT: br label %[[COND_END:.*]] +// CK-64: [[COND_FALSE]]: +// CK-64-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4 
+// CK-64-NEXT: br label %[[COND_END]] +// CK-64: [[COND_END]]: +// CK-64-NEXT: [[COND:%.*]] = phi i32 [ 9, %[[COND_TRUE]] ], [ [[TMP5]], %[[COND_FALSE]] ] +// CK-64-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4 +// CK-64-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4 +// CK-64-NEXT: store i32 [[TMP6]], ptr [[DOTOMP_IV]], align 4 +// CK-64-NEXT: br label %[[OMP_INNER_FOR_COND:.*]] +// CK-64: [[OMP_INNER_FOR_COND]]: +// CK-64-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4 +// CK-64-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4 +// CK-64-NEXT: [[CMP2:%.*]] = icmp sle i32 [[TMP7]], [[TMP8]] +// CK-64-NEXT: br i1 [[CMP2]], label %[[OMP_INNER_FOR_BODY:.*]], label %[[OMP_INNER_FOR_END:.*]] +// CK-64: [[OMP_INNER_FOR_BODY]]: +// CK-64-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4 +// CK-64-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP9]], 1 +// CK-64-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CK-64-NEXT: store i32 [[ADD]], ptr [[I]], align 4 +// CK-64-NEXT: [[TMP10:%.*]] = load i32, ptr [[X_ADDR]], align 4 +// CK-64-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP10]], 1 +// CK-64-NEXT: store i32 [[ADD3]], ptr [[X_ADDR]], align 4 +// CK-64-NEXT: br label %[[OMP_BODY_CONTINUE:.*]] +// CK-64: [[OMP_BODY_CONTINUE]]: +// CK-64-NEXT: br label %[[OMP_INNER_FOR_INC:.*]] +// CK-64: [[OMP_INNER_FOR_INC]]: +// CK-64-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4 +// CK-64-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP11]], 1 +// CK-64-NEXT: store i32 [[ADD4]], ptr [[DOTOMP_IV]], align 4 +// CK-64-NEXT: br label %[[OMP_INNER_FOR_COND]] +// CK-64: [[OMP_INNER_FOR_END]]: +// CK-64-NEXT: br label %[[OMP_LOOP_EXIT:.*]] +// CK-64: [[OMP_LOOP_EXIT]]: +// CK-64-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB3]], i32 [[TMP3]]) +// CK-64-NEXT: ret void +// +// +// CK-32-LABEL: define dso_local void @_Z4foo1i( +// CK-32-SAME: i32 [[A:%.*]]) #[[ATTR0:[0-9]+]] { +// CK-32-NEXT: [[ENTRY:.*:]] +// CK-32-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 +// CK-32-NEXT: [[D:%.*]] = alloca double, align 8 +// CK-32-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [1 x ptr], align 4 +// CK-32-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [1 x ptr], align 4 +// CK-32-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [1 x ptr], align 4 +// CK-32-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8 +// CK-32-NEXT: store i32 [[A]], ptr [[A_ADDR]], align 4 +// CK-32-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4 +// CK-32-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP0]] to double +// CK-32-NEXT: store double [[CONV]], ptr [[D]], align 8 +// CK-32-NEXT: [[TMP1:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CK-32-NEXT: store ptr [[D]], ptr [[TMP1]], align 4 +// CK-32-NEXT: [[TMP2:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CK-32-NEXT: store ptr [[D]], ptr [[TMP2]], align 4 +// CK-32-NEXT: [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i32 0, i32 0 +// CK-32-NEXT: store ptr null, ptr [[TMP3]], align 4 +// CK-32-NEXT: [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CK-32-NEXT: [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CK-32-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0 +// CK-32-NEXT: store i32 3, ptr [[TMP6]], align 4 +// CK-32-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr 
[[KERNEL_ARGS]], i32 0, i32 1 +// CK-32-NEXT: store i32 1, ptr [[TMP7]], align 4 +// CK-32-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2 +// CK-32-NEXT: store ptr [[TMP4]], ptr [[TMP8]], align 4 +// CK-32-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3 +// CK-32-NEXT: store ptr [[TMP5]], ptr [[TMP9]], align 4 +// CK-32-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4 +// CK-32-NEXT: store ptr @.offload_sizes, ptr [[TMP10]], align 4 +// CK-32-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5 +// CK-32-NEXT: store ptr @.offload_maptypes, ptr [[TMP11]], align 4 +// CK-32-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6 +// CK-32-NEXT: store ptr null, ptr [[TMP12]], align 4 +// CK-32-NEXT: [[TMP13:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7 +// CK-32-NEXT: store ptr null, ptr [[TMP13]], align 4 +// CK-32-NEXT: [[TMP14:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8 +// CK-32-NEXT: store i64 0, ptr [[TMP14]], align 8 +// CK-32-NEXT: [[TMP15:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9 +// CK-32-NEXT: store i64 0, ptr [[TMP15]], align 8 +// CK-32-NEXT: [[TMP16:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10 +// CK-32-NEXT: store [3 x i32] [i32 -1, i32 0, i32 0], ptr [[TMP16]], align 4 +// CK-32-NEXT: [[TMP17:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11 +// CK-32-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP17]], align 4 +// CK-32-NEXT: [[TMP18:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12 +// CK-32-NEXT: store i32 0, ptr [[TMP18]], align 4 +// CK-32-NEXT: [[TMP19:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1:[0-9]+]], i64 -1, i32 -1, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo1i_l23.region_id, ptr [[KERNEL_ARGS]]) +// CK-32-NEXT: [[TMP20:%.*]] = icmp ne i32 [[TMP19]], 0 +// CK-32-NEXT: br i1 [[TMP20]], label %[[OMP_OFFLOAD_FAILED:.*]], label %[[OMP_OFFLOAD_CONT:.*]] +// CK-32: [[OMP_OFFLOAD_FAILED]]: +// CK-32-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo1i_l23(ptr [[D]]) #[[ATTR2:[0-9]+]] +// CK-32-NEXT: br label %[[OMP_OFFLOAD_CONT]] +// CK-32: [[OMP_OFFLOAD_CONT]]: +// CK-32-NEXT: ret void +// +// +// CK-32-LABEL: define internal void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo1i_l23( +// CK-32-SAME: ptr nonnull align 4 dereferenceable(8) [[D:%.*]]) #[[ATTR1:[0-9]+]] { +// CK-32-NEXT: [[ENTRY:.*:]] +// CK-32-NEXT: [[D_ADDR:%.*]] = alloca ptr, align 4 +// CK-32-NEXT: [[D1:%.*]] = alloca double, align 8 +// CK-32-NEXT: store ptr [[D]], ptr [[D_ADDR]], align 4 +// CK-32-NEXT: [[TMP0:%.*]] = load ptr, ptr [[D_ADDR]], align 4, !nonnull [[META20:![0-9]+]], !align [[META21:![0-9]+]] +// CK-32-NEXT: [[TMP1:%.*]] = load double, ptr [[D1]], align 8 +// CK-32-NEXT: [[ADD:%.*]] = fadd double [[TMP1]], 1.000000e+00 +// CK-32-NEXT: store double [[ADD]], ptr [[D1]], align 8 +// CK-32-NEXT: ret void +// +// +// CK-32-LABEL: define dso_local void @_Z4foo2v( +// CK-32-SAME: ) 
#[[ATTR0]] { +// CK-32-NEXT: [[ENTRY:.*:]] +// CK-32-NEXT: [[PVTARR:%.*]] = alloca [10 x i32], align 4 +// CK-32-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [1 x ptr], align 4 +// CK-32-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [1 x ptr], align 4 +// CK-32-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [1 x ptr], align 4 +// CK-32-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8 +// CK-32-NEXT: [[TMP0:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CK-32-NEXT: store ptr [[PVTARR]], ptr [[TMP0]], align 4 +// CK-32-NEXT: [[TMP1:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CK-32-NEXT: store ptr [[PVTARR]], ptr [[TMP1]], align 4 +// CK-32-NEXT: [[TMP2:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i32 0, i32 0 +// CK-32-NEXT: store ptr null, ptr [[TMP2]], align 4 +// CK-32-NEXT: [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CK-32-NEXT: [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CK-32-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0 +// CK-32-NEXT: store i32 3, ptr [[TMP5]], align 4 +// CK-32-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1 +// CK-32-NEXT: store i32 1, ptr [[TMP6]], align 4 +// CK-32-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2 +// CK-32-NEXT: store ptr [[TMP3]], ptr [[TMP7]], align 4 +// CK-32-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3 +// CK-32-NEXT: store ptr [[TMP4]], ptr [[TMP8]], align 4 +// CK-32-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4 +// CK-32-NEXT: store ptr @.offload_sizes.1, ptr [[TMP9]], align 4 +// CK-32-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5 +// CK-32-NEXT: store ptr @.offload_maptypes.2, ptr [[TMP10]], align 4 +// CK-32-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6 +// CK-32-NEXT: store ptr null, ptr [[TMP11]], align 4 +// CK-32-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7 +// CK-32-NEXT: store ptr null, ptr [[TMP12]], align 4 +// CK-32-NEXT: [[TMP13:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8 +// CK-32-NEXT: store i64 0, ptr [[TMP13]], align 8 +// CK-32-NEXT: [[TMP14:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9 +// CK-32-NEXT: store i64 0, ptr [[TMP14]], align 8 +// CK-32-NEXT: [[TMP15:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10 +// CK-32-NEXT: store [3 x i32] [i32 -1, i32 0, i32 0], ptr [[TMP15]], align 4 +// CK-32-NEXT: [[TMP16:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11 +// CK-32-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP16]], align 4 +// CK-32-NEXT: [[TMP17:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12 +// CK-32-NEXT: store i32 0, ptr [[TMP17]], align 4 +// 
CK-32-NEXT: [[TMP18:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 -1, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo2v_l32.region_id, ptr [[KERNEL_ARGS]]) +// CK-32-NEXT: [[TMP19:%.*]] = icmp ne i32 [[TMP18]], 0 +// CK-32-NEXT: br i1 [[TMP19]], label %[[OMP_OFFLOAD_FAILED:.*]], label %[[OMP_OFFLOAD_CONT:.*]] +// CK-32: [[OMP_OFFLOAD_FAILED]]: +// CK-32-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo2v_l32(ptr [[PVTARR]]) #[[ATTR2]] +// CK-32-NEXT: br label %[[OMP_OFFLOAD_CONT]] +// CK-32: [[OMP_OFFLOAD_CONT]]: +// CK-32-NEXT: ret void +// +// +// CK-32-LABEL: define internal void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo2v_l32( +// CK-32-SAME: ptr nonnull align 4 dereferenceable(40) [[PVTARR:%.*]]) #[[ATTR1]] { +// CK-32-NEXT: [[ENTRY:.*:]] +// CK-32-NEXT: [[PVTARR_ADDR:%.*]] = alloca ptr, align 4 +// CK-32-NEXT: [[PVTARR1:%.*]] = alloca [10 x i32], align 4 +// CK-32-NEXT: store ptr [[PVTARR]], ptr [[PVTARR_ADDR]], align 4 +// CK-32-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PVTARR_ADDR]], align 4, !nonnull [[META20]], !align [[META21]] +// CK-32-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], ptr [[PVTARR1]], i32 0, i32 5 +// CK-32-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +// CK-32-NEXT: [[INC:%.*]] = add nsw i32 [[TMP1]], 1 +// CK-32-NEXT: store i32 [[INC]], ptr [[ARRAYIDX]], align 4 +// CK-32-NEXT: ret void +// +// +// CK-32-LABEL: define dso_local void @_Z4foo3v( +// CK-32-SAME: ) #[[ATTR0]] { +// CK-32-NEXT: [[ENTRY:.*:]] +// CK-32-NEXT: [[PA:%.*]] = alloca ptr, align 4 +// CK-32-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [1 x ptr], align 4 +// CK-32-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [1 x ptr], align 4 +// CK-32-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [1 x ptr], align 4 +// CK-32-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8 +// CK-32-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PA]], align 4 +// CK-32-NEXT: [[TMP1:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CK-32-NEXT: store ptr [[TMP0]], ptr [[TMP1]], align 4 +// CK-32-NEXT: [[TMP2:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CK-32-NEXT: store ptr [[TMP0]], ptr [[TMP2]], align 4 +// CK-32-NEXT: [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i32 0, i32 0 +// CK-32-NEXT: store ptr null, ptr [[TMP3]], align 4 +// CK-32-NEXT: [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CK-32-NEXT: [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CK-32-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0 +// CK-32-NEXT: store i32 3, ptr [[TMP6]], align 4 +// CK-32-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1 +// CK-32-NEXT: store i32 1, ptr [[TMP7]], align 4 +// CK-32-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2 +// CK-32-NEXT: store ptr [[TMP4]], ptr [[TMP8]], align 4 +// CK-32-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3 +// CK-32-NEXT: store ptr [[TMP5]], ptr [[TMP9]], align 4 +// CK-32-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4 +// CK-32-NEXT: store ptr @.offload_sizes.3, ptr [[TMP10]], 
align 4 +// CK-32-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5 +// CK-32-NEXT: store ptr @.offload_maptypes.4, ptr [[TMP11]], align 4 +// CK-32-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6 +// CK-32-NEXT: store ptr null, ptr [[TMP12]], align 4 +// CK-32-NEXT: [[TMP13:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7 +// CK-32-NEXT: store ptr null, ptr [[TMP13]], align 4 +// CK-32-NEXT: [[TMP14:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8 +// CK-32-NEXT: store i64 0, ptr [[TMP14]], align 8 +// CK-32-NEXT: [[TMP15:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9 +// CK-32-NEXT: store i64 0, ptr [[TMP15]], align 8 +// CK-32-NEXT: [[TMP16:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10 +// CK-32-NEXT: store [3 x i32] [i32 -1, i32 0, i32 0], ptr [[TMP16]], align 4 +// CK-32-NEXT: [[TMP17:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11 +// CK-32-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP17]], align 4 +// CK-32-NEXT: [[TMP18:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12 +// CK-32-NEXT: store i32 0, ptr [[TMP18]], align 4 +// CK-32-NEXT: [[TMP19:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 -1, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo3v_l41.region_id, ptr [[KERNEL_ARGS]]) +// CK-32-NEXT: [[TMP20:%.*]] = icmp ne i32 [[TMP19]], 0 +// CK-32-NEXT: br i1 [[TMP20]], label %[[OMP_OFFLOAD_FAILED:.*]], label %[[OMP_OFFLOAD_CONT:.*]] +// CK-32: [[OMP_OFFLOAD_FAILED]]: +// CK-32-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo3v_l41(ptr [[TMP0]]) #[[ATTR2]] +// CK-32-NEXT: br label %[[OMP_OFFLOAD_CONT]] +// CK-32: [[OMP_OFFLOAD_CONT]]: +// CK-32-NEXT: ret void +// +// +// CK-32-LABEL: define internal void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo3v_l41( +// CK-32-SAME: ptr [[PA:%.*]]) #[[ATTR1]] { +// CK-32-NEXT: [[ENTRY:.*:]] +// CK-32-NEXT: [[PA_ADDR:%.*]] = alloca ptr, align 4 +// CK-32-NEXT: [[PA1:%.*]] = alloca ptr, align 4 +// CK-32-NEXT: store ptr [[PA]], ptr [[PA_ADDR]], align 4 +// CK-32-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PA1]], align 4 +// CK-32-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i32 50 +// CK-32-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +// CK-32-NEXT: [[INC:%.*]] = add nsw i32 [[TMP1]], 1 +// CK-32-NEXT: store i32 [[INC]], ptr [[ARRAYIDX]], align 4 +// CK-32-NEXT: ret void +// +// +// CK-32-LABEL: define dso_local void @_Z4foo4v( +// CK-32-SAME: ) #[[ATTR0]] { +// CK-32-NEXT: [[ENTRY:.*:]] +// CK-32-NEXT: [[P:%.*]] = alloca i32, align 4 +// CK-32-NEXT: [[P_CASTED:%.*]] = alloca i32, align 4 +// CK-32-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [1 x ptr], align 4 +// CK-32-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [1 x ptr], align 4 +// CK-32-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [1 x ptr], align 4 +// CK-32-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8 +// CK-32-NEXT: [[TMP0:%.*]] = load i32, ptr [[P]], align 4 +// CK-32-NEXT: store i32 [[TMP0]], ptr [[P_CASTED]], align 4 +// CK-32-NEXT: [[TMP1:%.*]] = load i32, ptr [[P_CASTED]], align 4 +// CK-32-NEXT: [[TMP2:%.*]] = 
getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CK-32-NEXT: store i32 [[TMP1]], ptr [[TMP2]], align 4 +// CK-32-NEXT: [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CK-32-NEXT: store i32 [[TMP1]], ptr [[TMP3]], align 4 +// CK-32-NEXT: [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i32 0, i32 0 +// CK-32-NEXT: store ptr null, ptr [[TMP4]], align 4 +// CK-32-NEXT: [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CK-32-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CK-32-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0 +// CK-32-NEXT: store i32 3, ptr [[TMP7]], align 4 +// CK-32-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1 +// CK-32-NEXT: store i32 1, ptr [[TMP8]], align 4 +// CK-32-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2 +// CK-32-NEXT: store ptr [[TMP5]], ptr [[TMP9]], align 4 +// CK-32-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3 +// CK-32-NEXT: store ptr [[TMP6]], ptr [[TMP10]], align 4 +// CK-32-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4 +// CK-32-NEXT: store ptr @.offload_sizes.5, ptr [[TMP11]], align 4 +// CK-32-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5 +// CK-32-NEXT: store ptr @.offload_maptypes.6, ptr [[TMP12]], align 4 +// CK-32-NEXT: [[TMP13:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6 +// CK-32-NEXT: store ptr null, ptr [[TMP13]], align 4 +// CK-32-NEXT: [[TMP14:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7 +// CK-32-NEXT: store ptr null, ptr [[TMP14]], align 4 +// CK-32-NEXT: [[TMP15:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8 +// CK-32-NEXT: store i64 0, ptr [[TMP15]], align 8 +// CK-32-NEXT: [[TMP16:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9 +// CK-32-NEXT: store i64 0, ptr [[TMP16]], align 8 +// CK-32-NEXT: [[TMP17:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10 +// CK-32-NEXT: store [3 x i32] [i32 -1, i32 0, i32 0], ptr [[TMP17]], align 4 +// CK-32-NEXT: [[TMP18:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11 +// CK-32-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP18]], align 4 +// CK-32-NEXT: [[TMP19:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12 +// CK-32-NEXT: store i32 0, ptr [[TMP19]], align 4 +// CK-32-NEXT: [[TMP20:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 -1, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo4v_l52.region_id, ptr [[KERNEL_ARGS]]) +// CK-32-NEXT: [[TMP21:%.*]] = icmp ne i32 [[TMP20]], 0 +// CK-32-NEXT: br i1 [[TMP21]], label %[[OMP_OFFLOAD_FAILED:.*]], label %[[OMP_OFFLOAD_CONT:.*]] +// CK-32: [[OMP_OFFLOAD_FAILED]]: +// CK-32-NEXT: call void 
@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo4v_l52(i32 [[TMP1]]) #[[ATTR2]] +// CK-32-NEXT: br label %[[OMP_OFFLOAD_CONT]] +// CK-32: [[OMP_OFFLOAD_CONT]]: +// CK-32-NEXT: ret void +// +// +// CK-32-LABEL: define internal void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo4v_l52( +// CK-32-SAME: i32 [[P:%.*]]) #[[ATTR1]] { +// CK-32-NEXT: [[ENTRY:.*:]] +// CK-32-NEXT: [[P_ADDR:%.*]] = alloca i32, align 4 +// CK-32-NEXT: store i32 [[P]], ptr [[P_ADDR]], align 4 +// CK-32-NEXT: [[TMP0:%.*]] = load i32, ptr [[P_ADDR]], align 4 +// CK-32-NEXT: [[INC:%.*]] = add nsw i32 [[TMP0]], 1 +// CK-32-NEXT: store i32 [[INC]], ptr [[P_ADDR]], align 4 +// CK-32-NEXT: ret void +// +// +// CK-32-LABEL: define dso_local void @_Z4foo5i( +// CK-32-SAME: i32 [[A:%.*]]) #[[ATTR0]] { +// CK-32-NEXT: [[ENTRY:.*:]] +// CK-32-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 +// CK-32-NEXT: [[D:%.*]] = alloca double, align 8 +// CK-32-NEXT: [[PVTARR:%.*]] = alloca [10 x i32], align 4 +// CK-32-NEXT: [[PA:%.*]] = alloca ptr, align 4 +// CK-32-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [3 x ptr], align 4 +// CK-32-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [3 x ptr], align 4 +// CK-32-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [3 x ptr], align 4 +// CK-32-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8 +// CK-32-NEXT: store i32 [[A]], ptr [[A_ADDR]], align 4 +// CK-32-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4 +// CK-32-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP0]] to double +// CK-32-NEXT: store double [[CONV]], ptr [[D]], align 8 +// CK-32-NEXT: [[TMP1:%.*]] = load ptr, ptr [[PA]], align 4 +// CK-32-NEXT: [[TMP2:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CK-32-NEXT: store ptr [[D]], ptr [[TMP2]], align 4 +// CK-32-NEXT: [[TMP3:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CK-32-NEXT: store ptr [[D]], ptr [[TMP3]], align 4 +// CK-32-NEXT: [[TMP4:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i32 0, i32 0 +// CK-32-NEXT: store ptr null, ptr [[TMP4]], align 4 +// CK-32-NEXT: [[TMP5:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 1 +// CK-32-NEXT: store ptr [[PVTARR]], ptr [[TMP5]], align 4 +// CK-32-NEXT: [[TMP6:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 1 +// CK-32-NEXT: store ptr [[PVTARR]], ptr [[TMP6]], align 4 +// CK-32-NEXT: [[TMP7:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i32 0, i32 1 +// CK-32-NEXT: store ptr null, ptr [[TMP7]], align 4 +// CK-32-NEXT: [[TMP8:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 2 +// CK-32-NEXT: store ptr [[TMP1]], ptr [[TMP8]], align 4 +// CK-32-NEXT: [[TMP9:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 2 +// CK-32-NEXT: store ptr [[TMP1]], ptr [[TMP9]], align 4 +// CK-32-NEXT: [[TMP10:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i32 0, i32 2 +// CK-32-NEXT: store ptr null, ptr [[TMP10]], align 4 +// CK-32-NEXT: [[TMP11:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CK-32-NEXT: [[TMP12:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CK-32-NEXT: [[TMP13:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0 +// CK-32-NEXT: store i32 3, ptr [[TMP13]], align 4 +// CK-32-NEXT: [[TMP14:%.*]] = getelementptr inbounds nuw 
[[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1 +// CK-32-NEXT: store i32 3, ptr [[TMP14]], align 4 +// CK-32-NEXT: [[TMP15:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2 +// CK-32-NEXT: store ptr [[TMP11]], ptr [[TMP15]], align 4 +// CK-32-NEXT: [[TMP16:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3 +// CK-32-NEXT: store ptr [[TMP12]], ptr [[TMP16]], align 4 +// CK-32-NEXT: [[TMP17:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4 +// CK-32-NEXT: store ptr @.offload_sizes.7, ptr [[TMP17]], align 4 +// CK-32-NEXT: [[TMP18:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5 +// CK-32-NEXT: store ptr @.offload_maptypes.8, ptr [[TMP18]], align 4 +// CK-32-NEXT: [[TMP19:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6 +// CK-32-NEXT: store ptr null, ptr [[TMP19]], align 4 +// CK-32-NEXT: [[TMP20:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7 +// CK-32-NEXT: store ptr null, ptr [[TMP20]], align 4 +// CK-32-NEXT: [[TMP21:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8 +// CK-32-NEXT: store i64 0, ptr [[TMP21]], align 8 +// CK-32-NEXT: [[TMP22:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9 +// CK-32-NEXT: store i64 0, ptr [[TMP22]], align 8 +// CK-32-NEXT: [[TMP23:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10 +// CK-32-NEXT: store [3 x i32] [i32 -1, i32 0, i32 0], ptr [[TMP23]], align 4 +// CK-32-NEXT: [[TMP24:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11 +// CK-32-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP24]], align 4 +// CK-32-NEXT: [[TMP25:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12 +// CK-32-NEXT: store i32 0, ptr [[TMP25]], align 4 +// CK-32-NEXT: [[TMP26:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 -1, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo5i_l66.region_id, ptr [[KERNEL_ARGS]]) +// CK-32-NEXT: [[TMP27:%.*]] = icmp ne i32 [[TMP26]], 0 +// CK-32-NEXT: br i1 [[TMP27]], label %[[OMP_OFFLOAD_FAILED:.*]], label %[[OMP_OFFLOAD_CONT:.*]] +// CK-32: [[OMP_OFFLOAD_FAILED]]: +// CK-32-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo5i_l66(ptr [[D]], ptr [[PVTARR]], ptr [[TMP1]]) #[[ATTR2]] +// CK-32-NEXT: br label %[[OMP_OFFLOAD_CONT]] +// CK-32: [[OMP_OFFLOAD_CONT]]: +// CK-32-NEXT: ret void +// +// +// CK-32-LABEL: define internal void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo5i_l66( +// CK-32-SAME: ptr nonnull align 4 dereferenceable(8) [[D:%.*]], ptr nonnull align 4 dereferenceable(40) [[PVTARR:%.*]], ptr [[PA:%.*]]) #[[ATTR1]] { +// CK-32-NEXT: [[ENTRY:.*:]] +// CK-32-NEXT: [[D_ADDR:%.*]] = alloca ptr, align 4 +// CK-32-NEXT: [[PVTARR_ADDR:%.*]] = alloca ptr, align 4 +// CK-32-NEXT: [[PA_ADDR:%.*]] = alloca ptr, align 4 +// CK-32-NEXT: [[D1:%.*]] = alloca double, align 8 +// CK-32-NEXT: [[PVTARR2:%.*]] = alloca [10 x i32], align 4 +// CK-32-NEXT: [[PA3:%.*]] = alloca ptr, align 4 +// CK-32-NEXT: store ptr [[D]], ptr [[D_ADDR]], align 4 +// CK-32-NEXT: store ptr [[PVTARR]], ptr 
[[PVTARR_ADDR]], align 4 +// CK-32-NEXT: store ptr [[PA]], ptr [[PA_ADDR]], align 4 +// CK-32-NEXT: [[TMP0:%.*]] = load ptr, ptr [[D_ADDR]], align 4, !nonnull [[META20]], !align [[META21]] +// CK-32-NEXT: [[TMP1:%.*]] = load ptr, ptr [[PVTARR_ADDR]], align 4, !nonnull [[META20]], !align [[META21]] +// CK-32-NEXT: [[TMP2:%.*]] = load double, ptr [[D1]], align 8 +// CK-32-NEXT: [[ADD:%.*]] = fadd double [[TMP2]], 1.000000e+00 +// CK-32-NEXT: store double [[ADD]], ptr [[D1]], align 8 +// CK-32-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], ptr [[PVTARR2]], i32 0, i32 5 +// CK-32-NEXT: [[TMP3:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +// CK-32-NEXT: [[INC:%.*]] = add nsw i32 [[TMP3]], 1 +// CK-32-NEXT: store i32 [[INC]], ptr [[ARRAYIDX]], align 4 +// CK-32-NEXT: [[TMP4:%.*]] = load ptr, ptr [[PA3]], align 4 +// CK-32-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 50 +// CK-32-NEXT: [[TMP5:%.*]] = load i32, ptr [[ARRAYIDX4]], align 4 +// CK-32-NEXT: [[INC5:%.*]] = add nsw i32 [[TMP5]], 1 +// CK-32-NEXT: store i32 [[INC5]], ptr [[ARRAYIDX4]], align 4 +// CK-32-NEXT: ret void +// +// +// CK-32-LABEL: define dso_local void @_Z4foo6i( +// CK-32-SAME: i32 [[A:%.*]]) #[[ATTR0]] { +// CK-32-NEXT: [[ENTRY:.*:]] +// CK-32-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 +// CK-32-NEXT: [[D:%.*]] = alloca double, align 8 +// CK-32-NEXT: [[PVTARR:%.*]] = alloca [10 x i32], align 4 +// CK-32-NEXT: [[PA:%.*]] = alloca ptr, align 4 +// CK-32-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [3 x ptr], align 4 +// CK-32-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [3 x ptr], align 4 +// CK-32-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [3 x ptr], align 4 +// CK-32-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8 +// CK-32-NEXT: store i32 [[A]], ptr [[A_ADDR]], align 4 +// CK-32-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4 +// CK-32-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP0]] to double +// CK-32-NEXT: store double [[CONV]], ptr [[D]], align 8 +// CK-32-NEXT: [[TMP1:%.*]] = load ptr, ptr [[PA]], align 4 +// CK-32-NEXT: [[TMP2:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CK-32-NEXT: store ptr [[D]], ptr [[TMP2]], align 4 +// CK-32-NEXT: [[TMP3:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CK-32-NEXT: store ptr [[D]], ptr [[TMP3]], align 4 +// CK-32-NEXT: [[TMP4:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i32 0, i32 0 +// CK-32-NEXT: store ptr null, ptr [[TMP4]], align 4 +// CK-32-NEXT: [[TMP5:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 1 +// CK-32-NEXT: store ptr [[PVTARR]], ptr [[TMP5]], align 4 +// CK-32-NEXT: [[TMP6:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 1 +// CK-32-NEXT: store ptr [[PVTARR]], ptr [[TMP6]], align 4 +// CK-32-NEXT: [[TMP7:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i32 0, i32 1 +// CK-32-NEXT: store ptr null, ptr [[TMP7]], align 4 +// CK-32-NEXT: [[TMP8:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 2 +// CK-32-NEXT: store ptr [[TMP1]], ptr [[TMP8]], align 4 +// CK-32-NEXT: [[TMP9:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 2 +// CK-32-NEXT: store ptr [[TMP1]], ptr [[TMP9]], align 4 +// CK-32-NEXT: [[TMP10:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i32 0, i32 2 +// CK-32-NEXT: store ptr null, ptr [[TMP10]], align 4 +// CK-32-NEXT: 
[[TMP11:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CK-32-NEXT: [[TMP12:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CK-32-NEXT: [[TMP13:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0 +// CK-32-NEXT: store i32 3, ptr [[TMP13]], align 4 +// CK-32-NEXT: [[TMP14:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1 +// CK-32-NEXT: store i32 3, ptr [[TMP14]], align 4 +// CK-32-NEXT: [[TMP15:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2 +// CK-32-NEXT: store ptr [[TMP11]], ptr [[TMP15]], align 4 +// CK-32-NEXT: [[TMP16:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3 +// CK-32-NEXT: store ptr [[TMP12]], ptr [[TMP16]], align 4 +// CK-32-NEXT: [[TMP17:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4 +// CK-32-NEXT: store ptr @.offload_sizes.9, ptr [[TMP17]], align 4 +// CK-32-NEXT: [[TMP18:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5 +// CK-32-NEXT: store ptr @.offload_maptypes.10, ptr [[TMP18]], align 4 +// CK-32-NEXT: [[TMP19:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6 +// CK-32-NEXT: store ptr null, ptr [[TMP19]], align 4 +// CK-32-NEXT: [[TMP20:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7 +// CK-32-NEXT: store ptr null, ptr [[TMP20]], align 4 +// CK-32-NEXT: [[TMP21:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8 +// CK-32-NEXT: store i64 0, ptr [[TMP21]], align 8 +// CK-32-NEXT: [[TMP22:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9 +// CK-32-NEXT: store i64 0, ptr [[TMP22]], align 8 +// CK-32-NEXT: [[TMP23:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10 +// CK-32-NEXT: store [3 x i32] [i32 -1, i32 0, i32 0], ptr [[TMP23]], align 4 +// CK-32-NEXT: [[TMP24:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11 +// CK-32-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP24]], align 4 +// CK-32-NEXT: [[TMP25:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12 +// CK-32-NEXT: store i32 0, ptr [[TMP25]], align 4 +// CK-32-NEXT: [[TMP26:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 -1, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo6i_l82.region_id, ptr [[KERNEL_ARGS]]) +// CK-32-NEXT: [[TMP27:%.*]] = icmp ne i32 [[TMP26]], 0 +// CK-32-NEXT: br i1 [[TMP27]], label %[[OMP_OFFLOAD_FAILED:.*]], label %[[OMP_OFFLOAD_CONT:.*]] +// CK-32: [[OMP_OFFLOAD_FAILED]]: +// CK-32-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo6i_l82(ptr [[D]], ptr [[PVTARR]], ptr [[TMP1]]) #[[ATTR2]] +// CK-32-NEXT: br label %[[OMP_OFFLOAD_CONT]] +// CK-32: [[OMP_OFFLOAD_CONT]]: +// CK-32-NEXT: ret void +// +// +// CK-32-LABEL: define internal void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo6i_l82( +// CK-32-SAME: ptr nonnull align 4 dereferenceable(8) [[D:%.*]], ptr nonnull align 4 dereferenceable(40) [[PVTARR:%.*]], ptr [[PA:%.*]]) #[[ATTR1]] { +// CK-32-NEXT: [[ENTRY:.*:]] +// 
CK-32-NEXT: [[D_ADDR:%.*]] = alloca ptr, align 4 +// CK-32-NEXT: [[PVTARR_ADDR:%.*]] = alloca ptr, align 4 +// CK-32-NEXT: [[PA_ADDR:%.*]] = alloca ptr, align 4 +// CK-32-NEXT: [[D1:%.*]] = alloca double, align 8 +// CK-32-NEXT: store ptr [[D]], ptr [[D_ADDR]], align 4 +// CK-32-NEXT: store ptr [[PVTARR]], ptr [[PVTARR_ADDR]], align 4 +// CK-32-NEXT: store ptr [[PA]], ptr [[PA_ADDR]], align 4 +// CK-32-NEXT: [[TMP0:%.*]] = load ptr, ptr [[D_ADDR]], align 4, !nonnull [[META20]], !align [[META21]] +// CK-32-NEXT: [[TMP1:%.*]] = load ptr, ptr [[PVTARR_ADDR]], align 4, !nonnull [[META20]], !align [[META21]] +// CK-32-NEXT: [[TMP2:%.*]] = load double, ptr [[TMP0]], align 8 +// CK-32-NEXT: store double [[TMP2]], ptr [[D1]], align 8 +// CK-32-NEXT: [[TMP3:%.*]] = load double, ptr [[D1]], align 8 +// CK-32-NEXT: [[ADD:%.*]] = fadd double [[TMP3]], 1.000000e+00 +// CK-32-NEXT: store double [[ADD]], ptr [[D1]], align 8 +// CK-32-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], ptr [[TMP1]], i32 0, i32 5 +// CK-32-NEXT: [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +// CK-32-NEXT: [[INC:%.*]] = add nsw i32 [[TMP4]], 1 +// CK-32-NEXT: store i32 [[INC]], ptr [[ARRAYIDX]], align 4 +// CK-32-NEXT: [[TMP5:%.*]] = load ptr, ptr [[PA_ADDR]], align 4 +// CK-32-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 50 +// CK-32-NEXT: [[TMP6:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4 +// CK-32-NEXT: [[INC3:%.*]] = add nsw i32 [[TMP6]], 1 +// CK-32-NEXT: store i32 [[INC3]], ptr [[ARRAYIDX2]], align 4 +// CK-32-NEXT: ret void +// +// +// CK-32-LABEL: define dso_local void @_Z4foo7i( +// CK-32-SAME: i32 [[A:%.*]]) #[[ATTR0]] { +// CK-32-NEXT: [[ENTRY:.*:]] +// CK-32-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 +// CK-32-NEXT: [[D:%.*]] = alloca double, align 8 +// CK-32-NEXT: [[PVTARR:%.*]] = alloca [10 x i32], align 4 +// CK-32-NEXT: [[PA:%.*]] = alloca ptr, align 4 +// CK-32-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [3 x ptr], align 4 +// CK-32-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [3 x ptr], align 4 +// CK-32-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [3 x ptr], align 4 +// CK-32-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8 +// CK-32-NEXT: store i32 [[A]], ptr [[A_ADDR]], align 4 +// CK-32-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4 +// CK-32-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP0]] to double +// CK-32-NEXT: store double [[CONV]], ptr [[D]], align 8 +// CK-32-NEXT: [[TMP1:%.*]] = load ptr, ptr [[PA]], align 4 +// CK-32-NEXT: [[TMP2:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CK-32-NEXT: store ptr [[D]], ptr [[TMP2]], align 4 +// CK-32-NEXT: [[TMP3:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CK-32-NEXT: store ptr [[D]], ptr [[TMP3]], align 4 +// CK-32-NEXT: [[TMP4:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i32 0, i32 0 +// CK-32-NEXT: store ptr null, ptr [[TMP4]], align 4 +// CK-32-NEXT: [[TMP5:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 1 +// CK-32-NEXT: store ptr [[PVTARR]], ptr [[TMP5]], align 4 +// CK-32-NEXT: [[TMP6:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 1 +// CK-32-NEXT: store ptr [[PVTARR]], ptr [[TMP6]], align 4 +// CK-32-NEXT: [[TMP7:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i32 0, i32 1 +// CK-32-NEXT: store ptr null, ptr [[TMP7]], align 4 +// CK-32-NEXT: [[TMP8:%.*]] = getelementptr inbounds [3 x ptr], ptr 
[[DOTOFFLOAD_BASEPTRS]], i32 0, i32 2 +// CK-32-NEXT: store ptr [[TMP1]], ptr [[TMP8]], align 4 +// CK-32-NEXT: [[TMP9:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 2 +// CK-32-NEXT: store ptr [[TMP1]], ptr [[TMP9]], align 4 +// CK-32-NEXT: [[TMP10:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i32 0, i32 2 +// CK-32-NEXT: store ptr null, ptr [[TMP10]], align 4 +// CK-32-NEXT: [[TMP11:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CK-32-NEXT: [[TMP12:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CK-32-NEXT: [[TMP13:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0 +// CK-32-NEXT: store i32 3, ptr [[TMP13]], align 4 +// CK-32-NEXT: [[TMP14:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1 +// CK-32-NEXT: store i32 3, ptr [[TMP14]], align 4 +// CK-32-NEXT: [[TMP15:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2 +// CK-32-NEXT: store ptr [[TMP11]], ptr [[TMP15]], align 4 +// CK-32-NEXT: [[TMP16:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3 +// CK-32-NEXT: store ptr [[TMP12]], ptr [[TMP16]], align 4 +// CK-32-NEXT: [[TMP17:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4 +// CK-32-NEXT: store ptr @.offload_sizes.11, ptr [[TMP17]], align 4 +// CK-32-NEXT: [[TMP18:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5 +// CK-32-NEXT: store ptr @.offload_maptypes.12, ptr [[TMP18]], align 4 +// CK-32-NEXT: [[TMP19:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6 +// CK-32-NEXT: store ptr null, ptr [[TMP19]], align 4 +// CK-32-NEXT: [[TMP20:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7 +// CK-32-NEXT: store ptr null, ptr [[TMP20]], align 4 +// CK-32-NEXT: [[TMP21:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8 +// CK-32-NEXT: store i64 0, ptr [[TMP21]], align 8 +// CK-32-NEXT: [[TMP22:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9 +// CK-32-NEXT: store i64 0, ptr [[TMP22]], align 8 +// CK-32-NEXT: [[TMP23:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10 +// CK-32-NEXT: store [3 x i32] [i32 -1, i32 0, i32 0], ptr [[TMP23]], align 4 +// CK-32-NEXT: [[TMP24:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11 +// CK-32-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP24]], align 4 +// CK-32-NEXT: [[TMP25:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12 +// CK-32-NEXT: store i32 0, ptr [[TMP25]], align 4 +// CK-32-NEXT: [[TMP26:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 -1, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo7i_l98.region_id, ptr [[KERNEL_ARGS]]) +// CK-32-NEXT: [[TMP27:%.*]] = icmp ne i32 [[TMP26]], 0 +// CK-32-NEXT: br i1 [[TMP27]], label %[[OMP_OFFLOAD_FAILED:.*]], label %[[OMP_OFFLOAD_CONT:.*]] +// CK-32: [[OMP_OFFLOAD_FAILED]]: +// CK-32-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo7i_l98(ptr [[D]], 
ptr [[PVTARR]], ptr [[TMP1]]) #[[ATTR2]] +// CK-32-NEXT: br label %[[OMP_OFFLOAD_CONT]] +// CK-32: [[OMP_OFFLOAD_CONT]]: +// CK-32-NEXT: ret void +// +// +// CK-32-LABEL: define internal void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo7i_l98( +// CK-32-SAME: ptr nonnull align 4 dereferenceable(8) [[D:%.*]], ptr nonnull align 4 dereferenceable(40) [[PVTARR:%.*]], ptr [[PA:%.*]]) #[[ATTR1]] { +// CK-32-NEXT: [[ENTRY:.*:]] +// CK-32-NEXT: [[D_ADDR:%.*]] = alloca ptr, align 4 +// CK-32-NEXT: [[PVTARR_ADDR:%.*]] = alloca ptr, align 4 +// CK-32-NEXT: [[PA_ADDR:%.*]] = alloca ptr, align 4 +// CK-32-NEXT: [[D1:%.*]] = alloca double, align 8 +// CK-32-NEXT: [[PVTARR2:%.*]] = alloca [10 x i32], align 4 +// CK-32-NEXT: store ptr [[D]], ptr [[D_ADDR]], align 4 +// CK-32-NEXT: store ptr [[PVTARR]], ptr [[PVTARR_ADDR]], align 4 +// CK-32-NEXT: store ptr [[PA]], ptr [[PA_ADDR]], align 4 +// CK-32-NEXT: [[TMP0:%.*]] = load ptr, ptr [[D_ADDR]], align 4, !nonnull [[META20]], !align [[META21]] +// CK-32-NEXT: [[TMP1:%.*]] = load ptr, ptr [[PVTARR_ADDR]], align 4, !nonnull [[META20]], !align [[META21]] +// CK-32-NEXT: [[TMP2:%.*]] = load double, ptr [[TMP0]], align 8 +// CK-32-NEXT: store double [[TMP2]], ptr [[D1]], align 8 +// CK-32-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[PVTARR2]], ptr align 4 [[TMP1]], i32 40, i1 false) +// CK-32-NEXT: [[TMP3:%.*]] = load double, ptr [[D1]], align 8 +// CK-32-NEXT: [[ADD:%.*]] = fadd double [[TMP3]], 1.000000e+00 +// CK-32-NEXT: store double [[ADD]], ptr [[D1]], align 8 +// CK-32-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], ptr [[PVTARR2]], i32 0, i32 5 +// CK-32-NEXT: [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +// CK-32-NEXT: [[INC:%.*]] = add nsw i32 [[TMP4]], 1 +// CK-32-NEXT: store i32 [[INC]], ptr [[ARRAYIDX]], align 4 +// CK-32-NEXT: [[TMP5:%.*]] = load ptr, ptr [[PA_ADDR]], align 4 +// CK-32-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 50 +// CK-32-NEXT: [[TMP6:%.*]] = load i32, ptr [[ARRAYIDX3]], align 4 +// CK-32-NEXT: [[INC4:%.*]] = add nsw i32 [[TMP6]], 1 +// CK-32-NEXT: store i32 [[INC4]], ptr [[ARRAYIDX3]], align 4 +// CK-32-NEXT: ret void +// +// +// CK-32-LABEL: define dso_local void @_Z4foo8v( +// CK-32-SAME: ) #[[ATTR0]] { +// CK-32-NEXT: [[ENTRY:.*:]] +// CK-32-NEXT: [[X:%.*]] = alloca i32, align 4 +// CK-32-NEXT: [[X_CASTED:%.*]] = alloca i32, align 4 +// CK-32-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [1 x ptr], align 4 +// CK-32-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [1 x ptr], align 4 +// CK-32-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [1 x ptr], align 4 +// CK-32-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CK-32-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8 +// CK-32-NEXT: store i32 0, ptr [[X]], align 4 +// CK-32-NEXT: [[TMP0:%.*]] = load i32, ptr [[X]], align 4 +// CK-32-NEXT: store i32 [[TMP0]], ptr [[X_CASTED]], align 4 +// CK-32-NEXT: [[TMP1:%.*]] = load i32, ptr [[X_CASTED]], align 4 +// CK-32-NEXT: [[TMP2:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CK-32-NEXT: store i32 [[TMP1]], ptr [[TMP2]], align 4 +// CK-32-NEXT: [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CK-32-NEXT: store i32 [[TMP1]], ptr [[TMP3]], align 4 +// CK-32-NEXT: [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i32 0, i32 0 +// CK-32-NEXT: store ptr null, ptr [[TMP4]], align 4 +// CK-32-NEXT: [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr 
[[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CK-32-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CK-32-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0 +// CK-32-NEXT: store i32 3, ptr [[TMP7]], align 4 +// CK-32-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1 +// CK-32-NEXT: store i32 1, ptr [[TMP8]], align 4 +// CK-32-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2 +// CK-32-NEXT: store ptr [[TMP5]], ptr [[TMP9]], align 4 +// CK-32-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3 +// CK-32-NEXT: store ptr [[TMP6]], ptr [[TMP10]], align 4 +// CK-32-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4 +// CK-32-NEXT: store ptr @.offload_sizes.13, ptr [[TMP11]], align 4 +// CK-32-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5 +// CK-32-NEXT: store ptr @.offload_maptypes.14, ptr [[TMP12]], align 4 +// CK-32-NEXT: [[TMP13:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6 +// CK-32-NEXT: store ptr null, ptr [[TMP13]], align 4 +// CK-32-NEXT: [[TMP14:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7 +// CK-32-NEXT: store ptr null, ptr [[TMP14]], align 4 +// CK-32-NEXT: [[TMP15:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8 +// CK-32-NEXT: store i64 10, ptr [[TMP15]], align 8 +// CK-32-NEXT: [[TMP16:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9 +// CK-32-NEXT: store i64 0, ptr [[TMP16]], align 8 +// CK-32-NEXT: [[TMP17:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10 +// CK-32-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP17]], align 4 +// CK-32-NEXT: [[TMP18:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11 +// CK-32-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP18]], align 4 +// CK-32-NEXT: [[TMP19:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12 +// CK-32-NEXT: store i32 0, ptr [[TMP19]], align 4 +// CK-32-NEXT: [[TMP20:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 0, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo8v_l112.region_id, ptr [[KERNEL_ARGS]]) +// CK-32-NEXT: [[TMP21:%.*]] = icmp ne i32 [[TMP20]], 0 +// CK-32-NEXT: br i1 [[TMP21]], label %[[OMP_OFFLOAD_FAILED:.*]], label %[[OMP_OFFLOAD_CONT:.*]] +// CK-32: [[OMP_OFFLOAD_FAILED]]: +// CK-32-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo8v_l112(i32 [[TMP1]]) #[[ATTR2]] +// CK-32-NEXT: br label %[[OMP_OFFLOAD_CONT]] +// CK-32: [[OMP_OFFLOAD_CONT]]: +// CK-32-NEXT: ret void +// +// +// CK-32-LABEL: define internal void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo8v_l112( +// CK-32-SAME: i32 [[X:%.*]]) #[[ATTR1]] { +// CK-32-NEXT: [[ENTRY:.*:]] +// CK-32-NEXT: [[X_ADDR:%.*]] = alloca i32, align 4 +// CK-32-NEXT: [[X_CASTED:%.*]] = alloca i32, align 4 +// CK-32-NEXT: store i32 [[X]], ptr [[X_ADDR]], align 4 +// CK-32-NEXT: [[TMP0:%.*]] = load i32, 
ptr [[X_ADDR]], align 4 +// CK-32-NEXT: store i32 [[TMP0]], ptr [[X_CASTED]], align 4 +// CK-32-NEXT: [[TMP1:%.*]] = load i32, ptr [[X_CASTED]], align 4 +// CK-32-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB1]], i32 1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo8v_l112.omp_outlined, i32 [[TMP1]]) +// CK-32-NEXT: ret void +// +// +// CK-32-LABEL: define internal void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo8v_l112.omp_outlined( +// CK-32-SAME: ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], i32 [[X:%.*]]) #[[ATTR1]] { +// CK-32-NEXT: [[ENTRY:.*:]] +// CK-32-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 +// CK-32-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 +// CK-32-NEXT: [[X_ADDR:%.*]] = alloca i32, align 4 +// CK-32-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CK-32-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CK-32-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CK-32-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CK-32-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CK-32-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CK-32-NEXT: [[I:%.*]] = alloca i32, align 4 +// CK-32-NEXT: [[X_CASTED:%.*]] = alloca i32, align 4 +// CK-32-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4 +// CK-32-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4 +// CK-32-NEXT: store i32 [[X]], ptr [[X_ADDR]], align 4 +// CK-32-NEXT: store i32 0, ptr [[DOTOMP_COMB_LB]], align 4 +// CK-32-NEXT: store i32 9, ptr [[DOTOMP_COMB_UB]], align 4 +// CK-32-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4 +// CK-32-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4 +// CK-32-NEXT: [[TMP0:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4 +// CK-32-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4 +// CK-32-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB2:[0-9]+]], i32 [[TMP1]], i32 92, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_COMB_LB]], ptr [[DOTOMP_COMB_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1) +// CK-32-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4 +// CK-32-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9 +// CK-32-NEXT: br i1 [[CMP]], label %[[COND_TRUE:.*]], label %[[COND_FALSE:.*]] +// CK-32: [[COND_TRUE]]: +// CK-32-NEXT: br label %[[COND_END:.*]] +// CK-32: [[COND_FALSE]]: +// CK-32-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4 +// CK-32-NEXT: br label %[[COND_END]] +// CK-32: [[COND_END]]: +// CK-32-NEXT: [[COND:%.*]] = phi i32 [ 9, %[[COND_TRUE]] ], [ [[TMP3]], %[[COND_FALSE]] ] +// CK-32-NEXT: store i32 [[COND]], ptr [[DOTOMP_COMB_UB]], align 4 +// CK-32-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4 +// CK-32-NEXT: store i32 [[TMP4]], ptr [[DOTOMP_IV]], align 4 +// CK-32-NEXT: br label %[[OMP_INNER_FOR_COND:.*]] +// CK-32: [[OMP_INNER_FOR_COND]]: +// CK-32-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4 +// CK-32-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4 +// CK-32-NEXT: [[CMP1:%.*]] = icmp sle i32 [[TMP5]], [[TMP6]] +// CK-32-NEXT: br i1 [[CMP1]], label %[[OMP_INNER_FOR_BODY:.*]], label %[[OMP_INNER_FOR_END:.*]] +// CK-32: [[OMP_INNER_FOR_BODY]]: +// CK-32-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4 +// CK-32-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4 +// CK-32-NEXT: [[TMP9:%.*]] = load i32, ptr [[X_ADDR]], align 4 +// CK-32-NEXT: store i32 [[TMP9]], ptr [[X_CASTED]], align 4 +// CK-32-NEXT: [[TMP10:%.*]] = load i32, ptr 
[[X_CASTED]], align 4 +// CK-32-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB1]], i32 3, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo8v_l112.omp_outlined.omp_outlined, i32 [[TMP7]], i32 [[TMP8]], i32 [[TMP10]]) +// CK-32-NEXT: br label %[[OMP_INNER_FOR_INC:.*]] +// CK-32: [[OMP_INNER_FOR_INC]]: +// CK-32-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4 +// CK-32-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4 +// CK-32-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP11]], [[TMP12]] +// CK-32-NEXT: store i32 [[ADD]], ptr [[DOTOMP_IV]], align 4 +// CK-32-NEXT: br label %[[OMP_INNER_FOR_COND]] +// CK-32: [[OMP_INNER_FOR_END]]: +// CK-32-NEXT: br label %[[OMP_LOOP_EXIT:.*]] +// CK-32: [[OMP_LOOP_EXIT]]: +// CK-32-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB2]], i32 [[TMP1]]) +// CK-32-NEXT: ret void +// +// +// CK-32-LABEL: define internal void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo8v_l112.omp_outlined.omp_outlined( +// CK-32-SAME: ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[X:%.*]]) #[[ATTR1]] { +// CK-32-NEXT: [[ENTRY:.*:]] +// CK-32-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 +// CK-32-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 +// CK-32-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 +// CK-32-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 +// CK-32-NEXT: [[X_ADDR:%.*]] = alloca i32, align 4 +// CK-32-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CK-32-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CK-32-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CK-32-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CK-32-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CK-32-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CK-32-NEXT: [[I:%.*]] = alloca i32, align 4 +// CK-32-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4 +// CK-32-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4 +// CK-32-NEXT: store i32 [[DOTPREVIOUS_LB_]], ptr [[DOTPREVIOUS_LB__ADDR]], align 4 +// CK-32-NEXT: store i32 [[DOTPREVIOUS_UB_]], ptr [[DOTPREVIOUS_UB__ADDR]], align 4 +// CK-32-NEXT: store i32 [[X]], ptr [[X_ADDR]], align 4 +// CK-32-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4 +// CK-32-NEXT: store i32 9, ptr [[DOTOMP_UB]], align 4 +// CK-32-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTPREVIOUS_LB__ADDR]], align 4 +// CK-32-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTPREVIOUS_UB__ADDR]], align 4 +// CK-32-NEXT: store i32 [[TMP0]], ptr [[DOTOMP_LB]], align 4 +// CK-32-NEXT: store i32 [[TMP1]], ptr [[DOTOMP_UB]], align 4 +// CK-32-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4 +// CK-32-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4 +// CK-32-NEXT: [[TMP2:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4 +// CK-32-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4 +// CK-32-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB3:[0-9]+]], i32 [[TMP3]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1) +// CK-32-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4 +// CK-32-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP4]], 9 +// CK-32-NEXT: br i1 [[CMP]], label %[[COND_TRUE:.*]], label %[[COND_FALSE:.*]] +// CK-32: [[COND_TRUE]]: +// CK-32-NEXT: br label %[[COND_END:.*]] +// CK-32: [[COND_FALSE]]: +// CK-32-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4 +// CK-32-NEXT: br label %[[COND_END]] +// 
CK-32: [[COND_END]]: +// CK-32-NEXT: [[COND:%.*]] = phi i32 [ 9, %[[COND_TRUE]] ], [ [[TMP5]], %[[COND_FALSE]] ] +// CK-32-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4 +// CK-32-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4 +// CK-32-NEXT: store i32 [[TMP6]], ptr [[DOTOMP_IV]], align 4 +// CK-32-NEXT: br label %[[OMP_INNER_FOR_COND:.*]] +// CK-32: [[OMP_INNER_FOR_COND]]: +// CK-32-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4 +// CK-32-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4 +// CK-32-NEXT: [[CMP1:%.*]] = icmp sle i32 [[TMP7]], [[TMP8]] +// CK-32-NEXT: br i1 [[CMP1]], label %[[OMP_INNER_FOR_BODY:.*]], label %[[OMP_INNER_FOR_END:.*]] +// CK-32: [[OMP_INNER_FOR_BODY]]: +// CK-32-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4 +// CK-32-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP9]], 1 +// CK-32-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CK-32-NEXT: store i32 [[ADD]], ptr [[I]], align 4 +// CK-32-NEXT: [[TMP10:%.*]] = load i32, ptr [[X_ADDR]], align 4 +// CK-32-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP10]], 1 +// CK-32-NEXT: store i32 [[ADD2]], ptr [[X_ADDR]], align 4 +// CK-32-NEXT: br label %[[OMP_BODY_CONTINUE:.*]] +// CK-32: [[OMP_BODY_CONTINUE]]: +// CK-32-NEXT: br label %[[OMP_INNER_FOR_INC:.*]] +// CK-32: [[OMP_INNER_FOR_INC]]: +// CK-32-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4 +// CK-32-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP11]], 1 +// CK-32-NEXT: store i32 [[ADD3]], ptr [[DOTOMP_IV]], align 4 +// CK-32-NEXT: br label %[[OMP_INNER_FOR_COND]] +// CK-32: [[OMP_INNER_FOR_END]]: +// CK-32-NEXT: br label %[[OMP_LOOP_EXIT:.*]] +// CK-32: [[OMP_LOOP_EXIT]]: +// CK-32-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB3]], i32 [[TMP3]]) +// CK-32-NEXT: ret void +// +// +// SIMD-ONLY-64-LABEL: define dso_local void @_Z4foo1i( +// SIMD-ONLY-64-SAME: i32 signext [[A:%.*]]) #[[ATTR0:[0-9]+]] { +// SIMD-ONLY-64-NEXT: [[ENTRY:.*:]] +// SIMD-ONLY-64-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 +// SIMD-ONLY-64-NEXT: [[D:%.*]] = alloca double, align 8 +// SIMD-ONLY-64-NEXT: [[D1:%.*]] = alloca double, align 8 +// SIMD-ONLY-64-NEXT: store i32 [[A]], ptr [[A_ADDR]], align 4 +// SIMD-ONLY-64-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4 +// SIMD-ONLY-64-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP0]] to double +// SIMD-ONLY-64-NEXT: store double [[CONV]], ptr [[D]], align 8 +// SIMD-ONLY-64-NEXT: [[TMP1:%.*]] = load double, ptr [[D1]], align 8 +// SIMD-ONLY-64-NEXT: [[ADD:%.*]] = fadd double [[TMP1]], 1.000000e+00 +// SIMD-ONLY-64-NEXT: store double [[ADD]], ptr [[D1]], align 8 +// SIMD-ONLY-64-NEXT: ret void +// +// +// SIMD-ONLY-64-LABEL: define dso_local void @_Z4foo2v( +// SIMD-ONLY-64-SAME: ) #[[ATTR0]] { +// SIMD-ONLY-64-NEXT: [[ENTRY:.*:]] +// SIMD-ONLY-64-NEXT: [[PVTARR:%.*]] = alloca [10 x i32], align 4 +// SIMD-ONLY-64-NEXT: [[PVTARR1:%.*]] = alloca [10 x i32], align 4 +// SIMD-ONLY-64-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], ptr [[PVTARR1]], i64 0, i64 5 +// SIMD-ONLY-64-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +// SIMD-ONLY-64-NEXT: [[INC:%.*]] = add nsw i32 [[TMP0]], 1 +// SIMD-ONLY-64-NEXT: store i32 [[INC]], ptr [[ARRAYIDX]], align 4 +// SIMD-ONLY-64-NEXT: ret void +// +// +// SIMD-ONLY-64-LABEL: define dso_local void @_Z4foo3v( +// SIMD-ONLY-64-SAME: ) #[[ATTR0]] { +// SIMD-ONLY-64-NEXT: [[ENTRY:.*:]] +// SIMD-ONLY-64-NEXT: [[PA:%.*]] = alloca ptr, align 8 +// SIMD-ONLY-64-NEXT: [[PA1:%.*]] = alloca ptr, align 8 +// SIMD-ONLY-64-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PA1]], align 8 
+// SIMD-ONLY-64-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 50 +// SIMD-ONLY-64-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +// SIMD-ONLY-64-NEXT: [[INC:%.*]] = add nsw i32 [[TMP1]], 1 +// SIMD-ONLY-64-NEXT: store i32 [[INC]], ptr [[ARRAYIDX]], align 4 +// SIMD-ONLY-64-NEXT: ret void +// +// +// SIMD-ONLY-64-LABEL: define dso_local void @_Z4foo4v( +// SIMD-ONLY-64-SAME: ) #[[ATTR0]] { +// SIMD-ONLY-64-NEXT: [[ENTRY:.*:]] +// SIMD-ONLY-64-NEXT: [[P:%.*]] = alloca i32, align 4 +// SIMD-ONLY-64-NEXT: [[TMP0:%.*]] = load i32, ptr [[P]], align 4 +// SIMD-ONLY-64-NEXT: [[INC:%.*]] = add nsw i32 [[TMP0]], 1 +// SIMD-ONLY-64-NEXT: store i32 [[INC]], ptr [[P]], align 4 +// SIMD-ONLY-64-NEXT: ret void +// +// +// SIMD-ONLY-64-LABEL: define dso_local void @_Z4foo5i( +// SIMD-ONLY-64-SAME: i32 signext [[A:%.*]]) #[[ATTR0]] { +// SIMD-ONLY-64-NEXT: [[ENTRY:.*:]] +// SIMD-ONLY-64-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 +// SIMD-ONLY-64-NEXT: [[D:%.*]] = alloca double, align 8 +// SIMD-ONLY-64-NEXT: [[PVTARR:%.*]] = alloca [10 x i32], align 4 +// SIMD-ONLY-64-NEXT: [[PA:%.*]] = alloca ptr, align 8 +// SIMD-ONLY-64-NEXT: [[D1:%.*]] = alloca double, align 8 +// SIMD-ONLY-64-NEXT: [[PVTARR2:%.*]] = alloca [10 x i32], align 4 +// SIMD-ONLY-64-NEXT: [[PA3:%.*]] = alloca ptr, align 8 +// SIMD-ONLY-64-NEXT: store i32 [[A]], ptr [[A_ADDR]], align 4 +// SIMD-ONLY-64-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4 +// SIMD-ONLY-64-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP0]] to double +// SIMD-ONLY-64-NEXT: store double [[CONV]], ptr [[D]], align 8 +// SIMD-ONLY-64-NEXT: [[TMP1:%.*]] = load double, ptr [[D1]], align 8 +// SIMD-ONLY-64-NEXT: [[ADD:%.*]] = fadd double [[TMP1]], 1.000000e+00 +// SIMD-ONLY-64-NEXT: store double [[ADD]], ptr [[D1]], align 8 +// SIMD-ONLY-64-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], ptr [[PVTARR2]], i64 0, i64 5 +// SIMD-ONLY-64-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +// SIMD-ONLY-64-NEXT: [[INC:%.*]] = add nsw i32 [[TMP2]], 1 +// SIMD-ONLY-64-NEXT: store i32 [[INC]], ptr [[ARRAYIDX]], align 4 +// SIMD-ONLY-64-NEXT: [[TMP3:%.*]] = load ptr, ptr [[PA3]], align 8 +// SIMD-ONLY-64-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i64 50 +// SIMD-ONLY-64-NEXT: [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX4]], align 4 +// SIMD-ONLY-64-NEXT: [[INC5:%.*]] = add nsw i32 [[TMP4]], 1 +// SIMD-ONLY-64-NEXT: store i32 [[INC5]], ptr [[ARRAYIDX4]], align 4 +// SIMD-ONLY-64-NEXT: ret void +// +// +// SIMD-ONLY-64-LABEL: define dso_local void @_Z4foo6i( +// SIMD-ONLY-64-SAME: i32 signext [[A:%.*]]) #[[ATTR0]] { +// SIMD-ONLY-64-NEXT: [[ENTRY:.*:]] +// SIMD-ONLY-64-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 +// SIMD-ONLY-64-NEXT: [[D:%.*]] = alloca double, align 8 +// SIMD-ONLY-64-NEXT: [[PVTARR:%.*]] = alloca [10 x i32], align 4 +// SIMD-ONLY-64-NEXT: [[PA:%.*]] = alloca ptr, align 8 +// SIMD-ONLY-64-NEXT: store i32 [[A]], ptr [[A_ADDR]], align 4 +// SIMD-ONLY-64-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4 +// SIMD-ONLY-64-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP0]] to double +// SIMD-ONLY-64-NEXT: store double [[CONV]], ptr [[D]], align 8 +// SIMD-ONLY-64-NEXT: [[TMP1:%.*]] = load double, ptr [[D]], align 8 +// SIMD-ONLY-64-NEXT: [[ADD:%.*]] = fadd double [[TMP1]], 1.000000e+00 +// SIMD-ONLY-64-NEXT: store double [[ADD]], ptr [[D]], align 8 +// SIMD-ONLY-64-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], ptr [[PVTARR]], i64 0, i64 5 +// SIMD-ONLY-64-NEXT: [[TMP2:%.*]] = load i32, ptr 
[[ARRAYIDX]], align 4 +// SIMD-ONLY-64-NEXT: [[INC:%.*]] = add nsw i32 [[TMP2]], 1 +// SIMD-ONLY-64-NEXT: store i32 [[INC]], ptr [[ARRAYIDX]], align 4 +// SIMD-ONLY-64-NEXT: [[TMP3:%.*]] = load ptr, ptr [[PA]], align 8 +// SIMD-ONLY-64-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i64 50 +// SIMD-ONLY-64-NEXT: [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX1]], align 4 +// SIMD-ONLY-64-NEXT: [[INC2:%.*]] = add nsw i32 [[TMP4]], 1 +// SIMD-ONLY-64-NEXT: store i32 [[INC2]], ptr [[ARRAYIDX1]], align 4 +// SIMD-ONLY-64-NEXT: ret void +// +// +// SIMD-ONLY-64-LABEL: define dso_local void @_Z4foo7i( +// SIMD-ONLY-64-SAME: i32 signext [[A:%.*]]) #[[ATTR0]] { +// SIMD-ONLY-64-NEXT: [[ENTRY:.*:]] +// SIMD-ONLY-64-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 +// SIMD-ONLY-64-NEXT: [[D:%.*]] = alloca double, align 8 +// SIMD-ONLY-64-NEXT: [[PVTARR:%.*]] = alloca [10 x i32], align 4 +// SIMD-ONLY-64-NEXT: [[PA:%.*]] = alloca ptr, align 8 +// SIMD-ONLY-64-NEXT: store i32 [[A]], ptr [[A_ADDR]], align 4 +// SIMD-ONLY-64-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4 +// SIMD-ONLY-64-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP0]] to double +// SIMD-ONLY-64-NEXT: store double [[CONV]], ptr [[D]], align 8 +// SIMD-ONLY-64-NEXT: [[TMP1:%.*]] = load double, ptr [[D]], align 8 +// SIMD-ONLY-64-NEXT: [[ADD:%.*]] = fadd double [[TMP1]], 1.000000e+00 +// SIMD-ONLY-64-NEXT: store double [[ADD]], ptr [[D]], align 8 +// SIMD-ONLY-64-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], ptr [[PVTARR]], i64 0, i64 5 +// SIMD-ONLY-64-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +// SIMD-ONLY-64-NEXT: [[INC:%.*]] = add nsw i32 [[TMP2]], 1 +// SIMD-ONLY-64-NEXT: store i32 [[INC]], ptr [[ARRAYIDX]], align 4 +// SIMD-ONLY-64-NEXT: [[TMP3:%.*]] = load ptr, ptr [[PA]], align 8 +// SIMD-ONLY-64-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i64 50 +// SIMD-ONLY-64-NEXT: [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX1]], align 4 +// SIMD-ONLY-64-NEXT: [[INC2:%.*]] = add nsw i32 [[TMP4]], 1 +// SIMD-ONLY-64-NEXT: store i32 [[INC2]], ptr [[ARRAYIDX1]], align 4 +// SIMD-ONLY-64-NEXT: ret void +// +// +// SIMD-ONLY-64-LABEL: define dso_local void @_Z4foo8v( +// SIMD-ONLY-64-SAME: ) #[[ATTR0]] { +// SIMD-ONLY-64-NEXT: [[ENTRY:.*:]] +// SIMD-ONLY-64-NEXT: [[X:%.*]] = alloca i32, align 4 +// SIMD-ONLY-64-NEXT: [[I:%.*]] = alloca i32, align 4 +// SIMD-ONLY-64-NEXT: store i32 0, ptr [[X]], align 4 +// SIMD-ONLY-64-NEXT: store i32 0, ptr [[I]], align 4 +// SIMD-ONLY-64-NEXT: br label %[[FOR_COND:.*]] +// SIMD-ONLY-64: [[FOR_COND]]: +// SIMD-ONLY-64-NEXT: [[TMP0:%.*]] = load i32, ptr [[I]], align 4 +// SIMD-ONLY-64-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP0]], 10 +// SIMD-ONLY-64-NEXT: br i1 [[CMP]], label %[[FOR_BODY:.*]], label %[[FOR_END:.*]] +// SIMD-ONLY-64: [[FOR_BODY]]: +// SIMD-ONLY-64-NEXT: [[TMP1:%.*]] = load i32, ptr [[X]], align 4 +// SIMD-ONLY-64-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP1]], 1 +// SIMD-ONLY-64-NEXT: store i32 [[ADD]], ptr [[X]], align 4 +// SIMD-ONLY-64-NEXT: br label %[[FOR_INC:.*]] +// SIMD-ONLY-64: [[FOR_INC]]: +// SIMD-ONLY-64-NEXT: [[TMP2:%.*]] = load i32, ptr [[I]], align 4 +// SIMD-ONLY-64-NEXT: [[INC:%.*]] = add nsw i32 [[TMP2]], 1 +// SIMD-ONLY-64-NEXT: store i32 [[INC]], ptr [[I]], align 4 +// SIMD-ONLY-64-NEXT: br label %[[FOR_COND]], !llvm.loop [[LOOP2:![0-9]+]] +// SIMD-ONLY-64: [[FOR_END]]: +// SIMD-ONLY-64-NEXT: ret void +// +// +// SIMD-ONLY-32-LABEL: define dso_local void @_Z4foo1i( +// SIMD-ONLY-32-SAME: i32 [[A:%.*]]) #[[ATTR0:[0-9]+]] { +// 
SIMD-ONLY-32-NEXT: [[ENTRY:.*:]] +// SIMD-ONLY-32-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 +// SIMD-ONLY-32-NEXT: [[D:%.*]] = alloca double, align 8 +// SIMD-ONLY-32-NEXT: [[D1:%.*]] = alloca double, align 8 +// SIMD-ONLY-32-NEXT: store i32 [[A]], ptr [[A_ADDR]], align 4 +// SIMD-ONLY-32-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4 +// SIMD-ONLY-32-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP0]] to double +// SIMD-ONLY-32-NEXT: store double [[CONV]], ptr [[D]], align 8 +// SIMD-ONLY-32-NEXT: [[TMP1:%.*]] = load double, ptr [[D1]], align 8 +// SIMD-ONLY-32-NEXT: [[ADD:%.*]] = fadd double [[TMP1]], 1.000000e+00 +// SIMD-ONLY-32-NEXT: store double [[ADD]], ptr [[D1]], align 8 +// SIMD-ONLY-32-NEXT: ret void +// +// +// SIMD-ONLY-32-LABEL: define dso_local void @_Z4foo2v( +// SIMD-ONLY-32-SAME: ) #[[ATTR0]] { +// SIMD-ONLY-32-NEXT: [[ENTRY:.*:]] +// SIMD-ONLY-32-NEXT: [[PVTARR:%.*]] = alloca [10 x i32], align 4 +// SIMD-ONLY-32-NEXT: [[PVTARR1:%.*]] = alloca [10 x i32], align 4 +// SIMD-ONLY-32-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], ptr [[PVTARR1]], i32 0, i32 5 +// SIMD-ONLY-32-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +// SIMD-ONLY-32-NEXT: [[INC:%.*]] = add nsw i32 [[TMP0]], 1 +// SIMD-ONLY-32-NEXT: store i32 [[INC]], ptr [[ARRAYIDX]], align 4 +// SIMD-ONLY-32-NEXT: ret void +// +// +// SIMD-ONLY-32-LABEL: define dso_local void @_Z4foo3v( +// SIMD-ONLY-32-SAME: ) #[[ATTR0]] { +// SIMD-ONLY-32-NEXT: [[ENTRY:.*:]] +// SIMD-ONLY-32-NEXT: [[PA:%.*]] = alloca ptr, align 4 +// SIMD-ONLY-32-NEXT: [[PA1:%.*]] = alloca ptr, align 4 +// SIMD-ONLY-32-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PA1]], align 4 +// SIMD-ONLY-32-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i32 50 +// SIMD-ONLY-32-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +// SIMD-ONLY-32-NEXT: [[INC:%.*]] = add nsw i32 [[TMP1]], 1 +// SIMD-ONLY-32-NEXT: store i32 [[INC]], ptr [[ARRAYIDX]], align 4 +// SIMD-ONLY-32-NEXT: ret void +// +// +// SIMD-ONLY-32-LABEL: define dso_local void @_Z4foo4v( +// SIMD-ONLY-32-SAME: ) #[[ATTR0]] { +// SIMD-ONLY-32-NEXT: [[ENTRY:.*:]] +// SIMD-ONLY-32-NEXT: [[P:%.*]] = alloca i32, align 4 +// SIMD-ONLY-32-NEXT: [[TMP0:%.*]] = load i32, ptr [[P]], align 4 +// SIMD-ONLY-32-NEXT: [[INC:%.*]] = add nsw i32 [[TMP0]], 1 +// SIMD-ONLY-32-NEXT: store i32 [[INC]], ptr [[P]], align 4 +// SIMD-ONLY-32-NEXT: ret void +// +// +// SIMD-ONLY-32-LABEL: define dso_local void @_Z4foo5i( +// SIMD-ONLY-32-SAME: i32 [[A:%.*]]) #[[ATTR0]] { +// SIMD-ONLY-32-NEXT: [[ENTRY:.*:]] +// SIMD-ONLY-32-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 +// SIMD-ONLY-32-NEXT: [[D:%.*]] = alloca double, align 8 +// SIMD-ONLY-32-NEXT: [[PVTARR:%.*]] = alloca [10 x i32], align 4 +// SIMD-ONLY-32-NEXT: [[PA:%.*]] = alloca ptr, align 4 +// SIMD-ONLY-32-NEXT: [[D1:%.*]] = alloca double, align 8 +// SIMD-ONLY-32-NEXT: [[PVTARR2:%.*]] = alloca [10 x i32], align 4 +// SIMD-ONLY-32-NEXT: [[PA3:%.*]] = alloca ptr, align 4 +// SIMD-ONLY-32-NEXT: store i32 [[A]], ptr [[A_ADDR]], align 4 +// SIMD-ONLY-32-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4 +// SIMD-ONLY-32-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP0]] to double +// SIMD-ONLY-32-NEXT: store double [[CONV]], ptr [[D]], align 8 +// SIMD-ONLY-32-NEXT: [[TMP1:%.*]] = load double, ptr [[D1]], align 8 +// SIMD-ONLY-32-NEXT: [[ADD:%.*]] = fadd double [[TMP1]], 1.000000e+00 +// SIMD-ONLY-32-NEXT: store double [[ADD]], ptr [[D1]], align 8 +// SIMD-ONLY-32-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], ptr [[PVTARR2]], i32 
0, i32 5 +// SIMD-ONLY-32-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +// SIMD-ONLY-32-NEXT: [[INC:%.*]] = add nsw i32 [[TMP2]], 1 +// SIMD-ONLY-32-NEXT: store i32 [[INC]], ptr [[ARRAYIDX]], align 4 +// SIMD-ONLY-32-NEXT: [[TMP3:%.*]] = load ptr, ptr [[PA3]], align 4 +// SIMD-ONLY-32-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 50 +// SIMD-ONLY-32-NEXT: [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX4]], align 4 +// SIMD-ONLY-32-NEXT: [[INC5:%.*]] = add nsw i32 [[TMP4]], 1 +// SIMD-ONLY-32-NEXT: store i32 [[INC5]], ptr [[ARRAYIDX4]], align 4 +// SIMD-ONLY-32-NEXT: ret void +// +// +// SIMD-ONLY-32-LABEL: define dso_local void @_Z4foo6i( +// SIMD-ONLY-32-SAME: i32 [[A:%.*]]) #[[ATTR0]] { +// SIMD-ONLY-32-NEXT: [[ENTRY:.*:]] +// SIMD-ONLY-32-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 +// SIMD-ONLY-32-NEXT: [[D:%.*]] = alloca double, align 8 +// SIMD-ONLY-32-NEXT: [[PVTARR:%.*]] = alloca [10 x i32], align 4 +// SIMD-ONLY-32-NEXT: [[PA:%.*]] = alloca ptr, align 4 +// SIMD-ONLY-32-NEXT: store i32 [[A]], ptr [[A_ADDR]], align 4 +// SIMD-ONLY-32-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4 +// SIMD-ONLY-32-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP0]] to double +// SIMD-ONLY-32-NEXT: store double [[CONV]], ptr [[D]], align 8 +// SIMD-ONLY-32-NEXT: [[TMP1:%.*]] = load double, ptr [[D]], align 8 +// SIMD-ONLY-32-NEXT: [[ADD:%.*]] = fadd double [[TMP1]], 1.000000e+00 +// SIMD-ONLY-32-NEXT: store double [[ADD]], ptr [[D]], align 8 +// SIMD-ONLY-32-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], ptr [[PVTARR]], i32 0, i32 5 +// SIMD-ONLY-32-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +// SIMD-ONLY-32-NEXT: [[INC:%.*]] = add nsw i32 [[TMP2]], 1 +// SIMD-ONLY-32-NEXT: store i32 [[INC]], ptr [[ARRAYIDX]], align 4 +// SIMD-ONLY-32-NEXT: [[TMP3:%.*]] = load ptr, ptr [[PA]], align 4 +// SIMD-ONLY-32-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 50 +// SIMD-ONLY-32-NEXT: [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX1]], align 4 +// SIMD-ONLY-32-NEXT: [[INC2:%.*]] = add nsw i32 [[TMP4]], 1 +// SIMD-ONLY-32-NEXT: store i32 [[INC2]], ptr [[ARRAYIDX1]], align 4 +// SIMD-ONLY-32-NEXT: ret void +// +// +// SIMD-ONLY-32-LABEL: define dso_local void @_Z4foo7i( +// SIMD-ONLY-32-SAME: i32 [[A:%.*]]) #[[ATTR0]] { +// SIMD-ONLY-32-NEXT: [[ENTRY:.*:]] +// SIMD-ONLY-32-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 +// SIMD-ONLY-32-NEXT: [[D:%.*]] = alloca double, align 8 +// SIMD-ONLY-32-NEXT: [[PVTARR:%.*]] = alloca [10 x i32], align 4 +// SIMD-ONLY-32-NEXT: [[PA:%.*]] = alloca ptr, align 4 +// SIMD-ONLY-32-NEXT: store i32 [[A]], ptr [[A_ADDR]], align 4 +// SIMD-ONLY-32-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4 +// SIMD-ONLY-32-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP0]] to double +// SIMD-ONLY-32-NEXT: store double [[CONV]], ptr [[D]], align 8 +// SIMD-ONLY-32-NEXT: [[TMP1:%.*]] = load double, ptr [[D]], align 8 +// SIMD-ONLY-32-NEXT: [[ADD:%.*]] = fadd double [[TMP1]], 1.000000e+00 +// SIMD-ONLY-32-NEXT: store double [[ADD]], ptr [[D]], align 8 +// SIMD-ONLY-32-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], ptr [[PVTARR]], i32 0, i32 5 +// SIMD-ONLY-32-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +// SIMD-ONLY-32-NEXT: [[INC:%.*]] = add nsw i32 [[TMP2]], 1 +// SIMD-ONLY-32-NEXT: store i32 [[INC]], ptr [[ARRAYIDX]], align 4 +// SIMD-ONLY-32-NEXT: [[TMP3:%.*]] = load ptr, ptr [[PA]], align 4 +// SIMD-ONLY-32-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 50 +// SIMD-ONLY-32-NEXT: 
[[TMP4:%.*]] = load i32, ptr [[ARRAYIDX1]], align 4 +// SIMD-ONLY-32-NEXT: [[INC2:%.*]] = add nsw i32 [[TMP4]], 1 +// SIMD-ONLY-32-NEXT: store i32 [[INC2]], ptr [[ARRAYIDX1]], align 4 +// SIMD-ONLY-32-NEXT: ret void +// +// +// SIMD-ONLY-32-LABEL: define dso_local void @_Z4foo8v( +// SIMD-ONLY-32-SAME: ) #[[ATTR0]] { +// SIMD-ONLY-32-NEXT: [[ENTRY:.*:]] +// SIMD-ONLY-32-NEXT: [[X:%.*]] = alloca i32, align 4 +// SIMD-ONLY-32-NEXT: [[I:%.*]] = alloca i32, align 4 +// SIMD-ONLY-32-NEXT: store i32 0, ptr [[X]], align 4 +// SIMD-ONLY-32-NEXT: store i32 0, ptr [[I]], align 4 +// SIMD-ONLY-32-NEXT: br label %[[FOR_COND:.*]] +// SIMD-ONLY-32: [[FOR_COND]]: +// SIMD-ONLY-32-NEXT: [[TMP0:%.*]] = load i32, ptr [[I]], align 4 +// SIMD-ONLY-32-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP0]], 10 +// SIMD-ONLY-32-NEXT: br i1 [[CMP]], label %[[FOR_BODY:.*]], label %[[FOR_END:.*]] +// SIMD-ONLY-32: [[FOR_BODY]]: +// SIMD-ONLY-32-NEXT: [[TMP1:%.*]] = load i32, ptr [[X]], align 4 +// SIMD-ONLY-32-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP1]], 1 +// SIMD-ONLY-32-NEXT: store i32 [[ADD]], ptr [[X]], align 4 +// SIMD-ONLY-32-NEXT: br label %[[FOR_INC:.*]] +// SIMD-ONLY-32: [[FOR_INC]]: +// SIMD-ONLY-32-NEXT: [[TMP2:%.*]] = load i32, ptr [[I]], align 4 +// SIMD-ONLY-32-NEXT: [[INC:%.*]] = add nsw i32 [[TMP2]], 1 +// SIMD-ONLY-32-NEXT: store i32 [[INC]], ptr [[I]], align 4 +// SIMD-ONLY-32-NEXT: br label %[[FOR_COND]], !llvm.loop [[LOOP3:![0-9]+]] +// SIMD-ONLY-32: [[FOR_END]]: +// SIMD-ONLY-32-NEXT: ret void +// +//. +// CK-64: [[META19]] = !{} +// CK-64: [[META20]] = !{i64 4} +//. +// CK-32: [[META20]] = !{} +// CK-32: [[META21]] = !{i64 4} +//. +// SIMD-ONLY-64: [[LOOP2]] = distinct !{[[LOOP2]], [[META3:![0-9]+]]} +// SIMD-ONLY-64: [[META3]] = !{!"llvm.loop.mustprogress"} +//. +// SIMD-ONLY-32: [[LOOP3]] = distinct !{[[LOOP3]], [[META4:![0-9]+]]} +// SIMD-ONLY-32: [[META4]] = !{!"llvm.loop.mustprogress"} +//. diff --git a/clang/test/OpenMP/target_default_messages.cpp b/clang/test/OpenMP/target_default_messages.cpp index be677dffa21ca..6a1a1f99360b5 100644 --- a/clang/test/OpenMP/target_default_messages.cpp +++ b/clang/test/OpenMP/target_default_messages.cpp @@ -24,6 +24,8 @@ int main(int argc, char **argv) { for (int i=0; i<200; i++) foo(); #pragma omp target default(x) // expected-error {{expected 'none', 'shared', 'private' or 'firstprivate' in OpenMP clause 'default'}} for (int i=0; i<200; i++) foo(); +#pragma omp target default(none) // expected-note {{explicit data sharing attribute, data mapping attribute, or is_device_ptr clause requested here}} + x++; // expected-error {{variable 'x' must have explicitly specified data sharing attributes, data mapping attributes, or in an is_device_ptr clause}} #endif #ifdef OMP52 From 5b72096e96ff1fc7929d9e7e4a3755e6ff0fd584 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Tue, 11 Nov 2025 12:00:02 -0800 Subject: [PATCH 40/64] [Hexagon] Remove implicit conversions of MCRegister to unsigned. 
NFC (#167571) Use MCRegister instead of unsigned or use MCRegister::id() --- .../Hexagon/AsmParser/HexagonAsmParser.cpp | 2 +- .../Hexagon/MCTargetDesc/HexagonMCChecker.cpp | 20 +++++----- .../Hexagon/MCTargetDesc/HexagonMCChecker.h | 38 +++++++++---------- 3 files changed, 30 insertions(+), 30 deletions(-) diff --git a/llvm/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp b/llvm/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp index b94b1484205ae..c18db982bfd97 100644 --- a/llvm/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp +++ b/llvm/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp @@ -463,7 +463,7 @@ void HexagonOperand::print(raw_ostream &OS, const MCAsmInfo &MAI) const { break; case Register: OS << "<register R"; - OS << getReg() << ">"; + OS << getReg().id() << ">"; break; case Token: OS << "'" << getToken() << "'"; break; diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.cpp b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.cpp index 9b6bc5ade379d..0b2279bb2cfe6 100644 --- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.cpp +++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.cpp @@ -385,7 +385,7 @@ bool HexagonMCChecker::checkSlots() { bool HexagonMCChecker::checkPredicates() { // Check for proper use of new predicate registers. for (const auto &I : NewPreds) { - unsigned P = I; + MCRegister P = I; if (!Defs.count(P) || LatePreds.count(P) || Defs.count(Hexagon::P3_0)) { // Error out if the new predicate register is not defined, @@ -398,7 +398,7 @@ // Check for proper use of auto-anded of predicate registers. for (const auto &I : LatePreds) { - unsigned P = I; + MCRegister P = I; if (LatePreds.count(P) > 1 || Defs.count(P)) { // Error out if predicate register defined "late" multiple times or @@ -607,7 +607,7 @@ void HexagonMCChecker::checkRegisterCurDefs() { bool HexagonMCChecker::checkRegisters() { // Check for proper register definitions. for (const auto &I : Defs) { - unsigned R = I.first; + MCRegister R = I.first; if (isLoopRegister(R) && Defs.count(R) > 1 && (HexagonMCInstrInfo::isInnerLoop(MCB) || @@ -620,8 +620,8 @@ if (SoftDefs.count(R)) { // Error out for explicit changes to registers also weakly defined // (e.g., "{ usr = r0; r0 = sfadd(...) }"). - unsigned UsrR = Hexagon::USR; // Silence warning about mixed types in ?:. - unsigned BadR = RI.isSubRegister(Hexagon::USR, R) ? UsrR : R; + MCRegister UsrR = Hexagon::USR; + MCRegister BadR = RI.isSubRegister(Hexagon::USR, R) ? UsrR : R; reportErrorRegisters(BadR); return false; } @@ -633,8 +633,8 @@ if (PM.count(Unconditional)) { // Error out on an unconditional change when there are any other // changes, conditional or not. - unsigned UsrR = Hexagon::USR; - unsigned BadR = RI.isSubRegister(Hexagon::USR, R) ? UsrR : R; + MCRegister UsrR = Hexagon::USR; + MCRegister BadR = RI.isSubRegister(Hexagon::USR, R) ? UsrR : R; reportErrorRegisters(BadR); return false; } @@ -664,7 +664,7 @@ // Check for use of temporary definitions.
   for (const auto &I : TmpDefs) {
-    unsigned R = I;
+    MCRegister R = I;
 
     if (!Uses.count(R)) {
       // special case for vhist
@@ -765,12 +765,12 @@ void HexagonMCChecker::compoundRegisterMap(unsigned &Register) {
   }
 }
 
-void HexagonMCChecker::reportErrorRegisters(unsigned Register) {
+void HexagonMCChecker::reportErrorRegisters(MCRegister Register) {
   reportError("register `" + Twine(RI.getName(Register)) +
               "' modified more than once");
 }
 
-void HexagonMCChecker::reportErrorNewValue(unsigned Register) {
+void HexagonMCChecker::reportErrorNewValue(MCRegister Register) {
   reportError("register `" + Twine(RI.getName(Register)) +
               "' used with `.new' "
               "but not validly modified in the same packet");
diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.h b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.h
index e9b87c5315fe4..8beee8d7ec8eb 100644
--- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.h
+++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.h
@@ -39,41 +39,41 @@ class HexagonMCChecker {
   bool ReportErrors;
 
   /// Set of definitions: register #, if predicated, if predicated true.
-  using PredSense = std::pair<unsigned, bool>;
+  using PredSense = std::pair<MCRegister, bool>;
   static const PredSense Unconditional;
   using PredSet = std::multiset<PredSense>;
   using PredSetIterator = std::multiset<PredSense>::iterator;
 
-  using DefsIterator = DenseMap<unsigned, PredSet>::iterator;
-  DenseMap<unsigned, PredSet> Defs;
+  using DefsIterator = DenseMap<MCRegister, PredSet>::iterator;
+  DenseMap<MCRegister, PredSet> Defs;
 
   /// Set of weak definitions whose clashes should be enforced selectively.
-  using SoftDefsIterator = std::set<unsigned>::iterator;
-  std::set<unsigned> SoftDefs;
+  using SoftDefsIterator = std::set<MCRegister>::iterator;
+  std::set<MCRegister> SoftDefs;
 
   /// Set of temporary definitions not committed to the register file.
-  using TmpDefsIterator = std::set<unsigned>::iterator;
-  std::set<unsigned> TmpDefs;
+  using TmpDefsIterator = std::set<MCRegister>::iterator;
+  std::set<MCRegister> TmpDefs;
 
   /// Set of new predicates used.
-  using NewPredsIterator = std::set<unsigned>::iterator;
-  std::set<unsigned> NewPreds;
+  using NewPredsIterator = std::set<MCRegister>::iterator;
+  std::set<MCRegister> NewPreds;
 
   /// Set of predicates defined late.
-  using LatePredsIterator = std::multiset<unsigned>::iterator;
-  std::multiset<unsigned> LatePreds;
+  using LatePredsIterator = std::multiset<MCRegister>::iterator;
+  std::multiset<MCRegister> LatePreds;
 
   /// Set of uses.
-  using UsesIterator = std::set<unsigned>::iterator;
-  std::set<unsigned> Uses;
+  using UsesIterator = std::set<MCRegister>::iterator;
+  std::set<MCRegister> Uses;
 
   /// Pre-defined set of read-only registers.
-  using ReadOnlyIterator = std::set<unsigned>::iterator;
-  std::set<unsigned> ReadOnly;
+  using ReadOnlyIterator = std::set<MCRegister>::iterator;
+  std::set<MCRegister> ReadOnly;
 
   // Contains the vector-pair-registers with the even number
   // first ("v0:1", e.g.) used/def'd in this packet.
-  std::set<unsigned> ReversePairs;
+  std::set<MCRegister> ReversePairs;
 
   void init();
   void init(MCInst const &);
@@ -107,7 +107,7 @@ class HexagonMCChecker {
   static void compoundRegisterMap(unsigned &);
 
-  bool isLoopRegister(unsigned R) const {
+  bool isLoopRegister(MCRegister R) const {
     return (Hexagon::SA0 == R || Hexagon::LC0 == R || Hexagon::SA1 == R ||
             Hexagon::LC1 == R);
   }
@@ -120,8 +120,8 @@ class HexagonMCChecker {
                   MCSubtargetInfo const &STI, bool CopyReportErrors);
 
   bool check(bool FullCheck = true);
-  void reportErrorRegisters(unsigned Register);
-  void reportErrorNewValue(unsigned Register);
+  void reportErrorRegisters(MCRegister Register);
+  void reportErrorNewValue(MCRegister Register);
   void reportError(SMLoc Loc, Twine const &Msg);
   void reportNote(SMLoc Loc, Twine const &Msg);
   void reportError(Twine const &Msg);

From 99ed882a2efb449062ec96866c4a570f7bd778ce Mon Sep 17 00:00:00 2001
From: Justin Bogner
Date: Tue, 11 Nov 2025 12:03:52 -0800
Subject: [PATCH 41/64] [HLSL] Wrap offset info into a dedicated type. NFC (#167396)

Rather than using a nullable SmallVector, use a wrapper class for
offset info. This simplifies places that need to handle whether or
not there's any offset information.
---
 clang/lib/CodeGen/CGHLSLRuntime.cpp           | 46 +++++++------------
 clang/lib/CodeGen/CGHLSLRuntime.h             | 35 ++++++++++++--
 clang/lib/CodeGen/HLSLBufferLayoutBuilder.cpp | 37 +++++++++------
 clang/lib/CodeGen/HLSLBufferLayoutBuilder.h   | 10 ++--
 clang/lib/CodeGen/TargetInfo.h                |  6 +--
 clang/lib/CodeGen/Targets/DirectX.cpp         |  9 ++--
 clang/lib/CodeGen/Targets/SPIR.cpp            |  9 ++--
 7 files changed, 88 insertions(+), 64 deletions(-)

diff --git a/clang/lib/CodeGen/CGHLSLRuntime.cpp b/clang/lib/CodeGen/CGHLSLRuntime.cpp
index e392a12044a39..4bdba9b3da502 100644
--- a/clang/lib/CodeGen/CGHLSLRuntime.cpp
+++ b/clang/lib/CodeGen/CGHLSLRuntime.cpp
@@ -261,12 +261,12 @@ static std::optional initializeLocalResourceArray(
 
 llvm::Type *
 CGHLSLRuntime::convertHLSLSpecificType(const Type *T,
-                                       SmallVector<int32_t> *Packoffsets) {
+                                       const CGHLSLOffsetInfo &OffsetInfo) {
   assert(T->isHLSLSpecificType() && "Not an HLSL specific type!");
 
   // Check if the target has a specific translation for this type first.
   if (llvm::Type *TargetTy =
-          CGM.getTargetCodeGenInfo().getHLSLType(CGM, T, Packoffsets))
+          CGM.getTargetCodeGenInfo().getHLSLType(CGM, T, OffsetInfo))
     return TargetTy;
 
   llvm_unreachable("Generic handling of HLSL types is not supported.");
@@ -357,25 +357,14 @@ createBufferHandleType(const HLSLBufferDecl *BufDecl) {
   return cast<HLSLAttributedResourceType>(QT.getTypePtr());
 }
 
-// Iterates over all declarations in the HLSL buffer and based on the
-// packoffset or register(c#) annotations it fills outs the Layout
-// vector with the user-specified layout offsets.
-// The buffer offsets can be specified 2 ways:
-// 1. declarations in cbuffer {} block can have a packoffset annotation
-//    (translates to HLSLPackOffsetAttr)
-// 2. default constant buffer declarations at global scope can have
-//    register(c#) annotations (translates to HLSLResourceBindingAttr with
-//    RegisterType::C)
-// It is not guaranteed that all declarations in a buffer have an annotation.
-// For those where it is not specified a -1 value is added to the Layout
-// vector. In the final layout these declarations will be placed at the end
-// of the HLSL buffer after all of the elements with specified offset.
-static void fillPackoffsetLayout(const HLSLBufferDecl *BufDecl, - SmallVector &Layout) { - assert(Layout.empty() && "expected empty vector for layout"); - assert(BufDecl->hasValidPackoffset()); +CGHLSLOffsetInfo CGHLSLOffsetInfo::fromDecl(const HLSLBufferDecl &BufDecl) { + CGHLSLOffsetInfo Result; - for (Decl *D : BufDecl->buffer_decls()) { + // If we don't have packoffset info, just return an empty result. + if (!BufDecl.hasValidPackoffset()) + return Result; + + for (Decl *D : BufDecl.buffer_decls()) { if (isa(D) || isa(D)) { continue; } @@ -384,11 +373,11 @@ static void fillPackoffsetLayout(const HLSLBufferDecl *BufDecl, continue; if (!VD->hasAttrs()) { - Layout.push_back(-1); + Result.Offsets.push_back(Unspecified); continue; } - int32_t Offset = -1; + uint32_t Offset = Unspecified; for (auto *Attr : VD->getAttrs()) { if (auto *POA = dyn_cast(Attr)) { Offset = POA->getOffsetInBytes(); @@ -401,8 +390,9 @@ static void fillPackoffsetLayout(const HLSLBufferDecl *BufDecl, break; } } - Layout.push_back(Offset); + Result.Offsets.push_back(Offset); } + return Result; } // Codegen for HLSLBufferDecl @@ -419,13 +409,9 @@ void CGHLSLRuntime::addBuffer(const HLSLBufferDecl *BufDecl) { return; // create global variable for the constant buffer - SmallVector Layout; - if (BufDecl->hasValidPackoffset()) - fillPackoffsetLayout(BufDecl, Layout); - - llvm::TargetExtType *TargetTy = - cast(convertHLSLSpecificType( - ResHandleTy, BufDecl->hasValidPackoffset() ? &Layout : nullptr)); + CGHLSLOffsetInfo OffsetInfo = CGHLSLOffsetInfo::fromDecl(*BufDecl); + llvm::TargetExtType *TargetTy = cast( + convertHLSLSpecificType(ResHandleTy, OffsetInfo)); llvm::GlobalVariable *BufGV = new GlobalVariable( TargetTy, /*isConstant*/ false, GlobalValue::LinkageTypes::ExternalLinkage, PoisonValue::get(TargetTy), diff --git a/clang/lib/CodeGen/CGHLSLRuntime.h b/clang/lib/CodeGen/CGHLSLRuntime.h index 9d31714ab8606..488a322ca7569 100644 --- a/clang/lib/CodeGen/CGHLSLRuntime.h +++ b/clang/lib/CodeGen/CGHLSLRuntime.h @@ -81,6 +81,33 @@ class CodeGenModule; class CodeGenFunction; class LValue; +class CGHLSLOffsetInfo { + SmallVector Offsets; + +public: + static const uint32_t Unspecified = ~0U; + + /// Iterates over all declarations in the HLSL buffer and based on the + /// packoffset or register(c#) annotations it fills outs the Offsets vector + /// with the user-specified layout offsets. The buffer offsets can be + /// specified 2 ways: 1. declarations in cbuffer {} block can have a + /// packoffset annotation (translates to HLSLPackOffsetAttr) 2. default + /// constant buffer declarations at global scope can have register(c#) + /// annotations (translates to HLSLResourceBindingAttr with RegisterType::C) + /// It is not guaranteed that all declarations in a buffer have an annotation. + /// For those where it is not specified a `~0U` value is added to the Offsets + /// vector. In the final layout these declarations will be placed at the end + /// of the HLSL buffer after all of the elements with specified offset. + static CGHLSLOffsetInfo fromDecl(const HLSLBufferDecl &BufDecl); + + /// Get the given offset, or `~0U` if there is no offset for the member. 
+ uint32_t operator[](size_t I) const { + if (Offsets.empty()) + return Unspecified; + return Offsets[I]; + } +}; + class CGHLSLRuntime { public: //===----------------------------------------------------------------------===// @@ -167,9 +194,11 @@ class CGHLSLRuntime { CGHLSLRuntime(CodeGenModule &CGM) : CGM(CGM) {} virtual ~CGHLSLRuntime() {} - llvm::Type * - convertHLSLSpecificType(const Type *T, - SmallVector *Packoffsets = nullptr); + llvm::Type *convertHLSLSpecificType(const Type *T, + const CGHLSLOffsetInfo &OffsetInfo); + llvm::Type *convertHLSLSpecificType(const Type *T) { + return convertHLSLSpecificType(T, CGHLSLOffsetInfo()); + } void generateGlobalCtorDtorCalls(); diff --git a/clang/lib/CodeGen/HLSLBufferLayoutBuilder.cpp b/clang/lib/CodeGen/HLSLBufferLayoutBuilder.cpp index 838903cdcd1ee..4bc6d565fd41f 100644 --- a/clang/lib/CodeGen/HLSLBufferLayoutBuilder.cpp +++ b/clang/lib/CodeGen/HLSLBufferLayoutBuilder.cpp @@ -66,8 +66,9 @@ namespace CodeGen { // annotation though. For those that don't, the PackOffsets array will contain // -1 value instead. These elements must be placed at the end of the layout // after all of the elements with specific offset. -llvm::TargetExtType *HLSLBufferLayoutBuilder::createLayoutType( - const RecordType *RT, const llvm::SmallVector *PackOffsets) { +llvm::TargetExtType * +HLSLBufferLayoutBuilder::createLayoutType(const RecordType *RT, + const CGHLSLOffsetInfo &OffsetInfo) { // check if we already have the layout type for this struct if (llvm::TargetExtType *Ty = @@ -101,14 +102,10 @@ llvm::TargetExtType *HLSLBufferLayoutBuilder::createLayoutType( const CXXRecordDecl *RD = RecordDecls.pop_back_val(); for (const auto *FD : RD->fields()) { - assert((!PackOffsets || Index < PackOffsets->size()) && - "number of elements in layout struct does not match number of " - "packoffset annotations"); - // No PackOffset info at all, or have a valid packoffset/register(c#) // annotations value -> layout the field. - const int PO = PackOffsets ? (*PackOffsets)[Index++] : -1; - if (!PackOffsets || PO != -1) { + const uint32_t PO = OffsetInfo[Index++]; + if (PO != CGHLSLOffsetInfo::Unspecified) { if (!layoutField(FD, EndOffset, FieldOffset, FieldType, PO)) return nullptr; Layout.push_back(FieldOffset); @@ -175,7 +172,7 @@ bool HLSLBufferLayoutBuilder::layoutField(const FieldDecl *FD, unsigned &EndOffset, unsigned &FieldOffset, llvm::Type *&FieldType, - int Packoffset) { + uint32_t Packoffset) { // Size of element; for arrays this is a size of a single element in the // array. Total array size of calculated as (ArrayCount-1) * ArrayStride + @@ -201,8 +198,9 @@ bool HLSLBufferLayoutBuilder::layoutField(const FieldDecl *FD, // For array of structures, create a new array with a layout type // instead of the structure type. if (Ty->isStructureOrClassType()) { + CGHLSLOffsetInfo EmptyOffsets; llvm::Type *NewTy = cast( - createLayoutType(Ty->getAsCanonical())); + createLayoutType(Ty->getAsCanonical(), EmptyOffsets)); if (!NewTy) return false; assert(isa(NewTy) && "expected target type"); @@ -216,17 +214,20 @@ bool HLSLBufferLayoutBuilder::layoutField(const FieldDecl *FD, ElemLayoutTy = CGM.getTypes().ConvertTypeForMem(FieldTy); } ArrayStride = llvm::alignTo(ElemSize, CBufferRowSizeInBytes); - ElemOffset = (Packoffset != -1) ? Packoffset : NextRowOffset; + ElemOffset = (Packoffset != CGHLSLOffsetInfo::Unspecified) ? 
Packoffset + : NextRowOffset; } else if (FieldTy->isStructureOrClassType()) { // Create a layout type for the structure + CGHLSLOffsetInfo EmptyOffsets; ElemLayoutTy = createLayoutType( - cast(FieldTy->getAsCanonical())); + cast(FieldTy->getAsCanonical()), EmptyOffsets); if (!ElemLayoutTy) return false; assert(isa(ElemLayoutTy) && "expected target type"); ElemSize = cast(ElemLayoutTy)->getIntParameter(0); - ElemOffset = (Packoffset != -1) ? Packoffset : NextRowOffset; + ElemOffset = (Packoffset != CGHLSLOffsetInfo::Unspecified) ? Packoffset + : NextRowOffset; } else { // scalar or vector - find element size and alignment @@ -246,7 +247,7 @@ bool HLSLBufferLayoutBuilder::layoutField(const FieldDecl *FD, } // calculate or get element offset for the vector or scalar - if (Packoffset != -1) { + if (Packoffset != CGHLSLOffsetInfo::Unspecified) { ElemOffset = Packoffset; } else { ElemOffset = llvm::alignTo(EndOffset, Align); @@ -269,5 +270,13 @@ bool HLSLBufferLayoutBuilder::layoutField(const FieldDecl *FD, return true; } +bool HLSLBufferLayoutBuilder::layoutField(const FieldDecl *FD, + unsigned &EndOffset, + unsigned &FieldOffset, + llvm::Type *&FieldType) { + return layoutField(FD, EndOffset, FieldOffset, FieldType, + CGHLSLOffsetInfo::Unspecified); +} + } // namespace CodeGen } // namespace clang diff --git a/clang/lib/CodeGen/HLSLBufferLayoutBuilder.h b/clang/lib/CodeGen/HLSLBufferLayoutBuilder.h index 61240b280cfcb..916e60e83e2c0 100644 --- a/clang/lib/CodeGen/HLSLBufferLayoutBuilder.h +++ b/clang/lib/CodeGen/HLSLBufferLayoutBuilder.h @@ -14,6 +14,7 @@ class RecordType; class FieldDecl; namespace CodeGen { +class CGHLSLOffsetInfo; class CodeGenModule; //===----------------------------------------------------------------------===// @@ -33,14 +34,15 @@ class HLSLBufferLayoutBuilder { // Returns LLVM target extension type with the name LayoutTypeName // for given structure type and layout data. The first number in // the Layout is the size followed by offsets for each struct element. 
- llvm::TargetExtType * - createLayoutType(const RecordType *StructType, - const llvm::SmallVector *Packoffsets = nullptr); + llvm::TargetExtType *createLayoutType(const RecordType *StructType, + const CGHLSLOffsetInfo &OffsetInfo); private: bool layoutField(const clang::FieldDecl *FD, unsigned &EndOffset, unsigned &FieldOffset, llvm::Type *&FieldType, - int Packoffset = -1); + uint32_t Packoffset); + bool layoutField(const clang::FieldDecl *FD, unsigned &EndOffset, + unsigned &FieldOffset, llvm::Type *&FieldType); }; } // namespace CodeGen diff --git a/clang/lib/CodeGen/TargetInfo.h b/clang/lib/CodeGen/TargetInfo.h index f63e900669d97..383f52f298d2e 100644 --- a/clang/lib/CodeGen/TargetInfo.h +++ b/clang/lib/CodeGen/TargetInfo.h @@ -39,6 +39,7 @@ class ABIInfo; class CallArgList; class CodeGenFunction; class CGBlockInfo; +class CGHLSLOffsetInfo; class SwiftABIInfo; /// TargetCodeGenInfo - This class organizes various target-specific @@ -442,9 +443,8 @@ class TargetCodeGenInfo { } /// Return an LLVM type that corresponds to a HLSL type - virtual llvm::Type * - getHLSLType(CodeGenModule &CGM, const Type *T, - const SmallVector *Packoffsets = nullptr) const { + virtual llvm::Type *getHLSLType(CodeGenModule &CGM, const Type *T, + const CGHLSLOffsetInfo &OffsetInfo) const { return nullptr; } diff --git a/clang/lib/CodeGen/Targets/DirectX.cpp b/clang/lib/CodeGen/Targets/DirectX.cpp index b4cebb9a32aca..f30b30284cb12 100644 --- a/clang/lib/CodeGen/Targets/DirectX.cpp +++ b/clang/lib/CodeGen/Targets/DirectX.cpp @@ -29,14 +29,13 @@ class DirectXTargetCodeGenInfo : public TargetCodeGenInfo { DirectXTargetCodeGenInfo(CodeGen::CodeGenTypes &CGT) : TargetCodeGenInfo(std::make_unique(CGT)) {} - llvm::Type * - getHLSLType(CodeGenModule &CGM, const Type *T, - const SmallVector *Packoffsets = nullptr) const override; + llvm::Type *getHLSLType(CodeGenModule &CGM, const Type *T, + const CGHLSLOffsetInfo &OffsetInfo) const override; }; llvm::Type *DirectXTargetCodeGenInfo::getHLSLType( CodeGenModule &CGM, const Type *Ty, - const SmallVector *Packoffsets) const { + const CGHLSLOffsetInfo &OffsetInfo) const { auto *ResType = dyn_cast(Ty); if (!ResType) return nullptr; @@ -78,7 +77,7 @@ llvm::Type *DirectXTargetCodeGenInfo::getHLSLType( llvm::Type *BufferLayoutTy = HLSLBufferLayoutBuilder(CGM, "dx.Layout") .createLayoutType(ContainedTy->castAsCanonical(), - Packoffsets); + OffsetInfo); if (!BufferLayoutTy) return nullptr; diff --git a/clang/lib/CodeGen/Targets/SPIR.cpp b/clang/lib/CodeGen/Targets/SPIR.cpp index 161a944b16bda..be7e9ccecae9f 100644 --- a/clang/lib/CodeGen/Targets/SPIR.cpp +++ b/clang/lib/CodeGen/Targets/SPIR.cpp @@ -53,9 +53,8 @@ class CommonSPIRTargetCodeGenInfo : public TargetCodeGenInfo { unsigned getDeviceKernelCallingConv() const override; llvm::Type *getOpenCLType(CodeGenModule &CGM, const Type *T) const override; - llvm::Type * - getHLSLType(CodeGenModule &CGM, const Type *Ty, - const SmallVector *Packoffsets = nullptr) const override; + llvm::Type *getHLSLType(CodeGenModule &CGM, const Type *Ty, + const CGHLSLOffsetInfo &OffsetInfo) const override; llvm::Type *getSPIRVImageTypeFromHLSLResource( const HLSLAttributedResourceType::Attributes &attributes, QualType SampledType, CodeGenModule &CGM) const; @@ -518,7 +517,7 @@ static llvm::Type *getInlineSpirvType(CodeGenModule &CGM, llvm::Type *CommonSPIRTargetCodeGenInfo::getHLSLType( CodeGenModule &CGM, const Type *Ty, - const SmallVector *Packoffsets) const { + const CGHLSLOffsetInfo &OffsetInfo) const { llvm::LLVMContext &Ctx = 
CGM.getLLVMContext(); if (auto *SpirvType = dyn_cast(Ty)) @@ -567,7 +566,7 @@ llvm::Type *CommonSPIRTargetCodeGenInfo::getHLSLType( llvm::Type *BufferLayoutTy = HLSLBufferLayoutBuilder(CGM, "spirv.Layout") .createLayoutType(ContainedTy->castAsCanonical(), - Packoffsets); + OffsetInfo); uint32_t StorageClass = /* Uniform storage class */ 2; return llvm::TargetExtType::get(Ctx, "spirv.VulkanBuffer", {BufferLayoutTy}, {StorageClass, false}); From 7cd9d3df73b168359b222d5e490425bb08f0eb22 Mon Sep 17 00:00:00 2001 From: Aiden Grossman Date: Tue, 11 Nov 2025 12:05:39 -0800 Subject: [PATCH 42/64] [Github] Make Bazel Build/Test use GCS Cache (#167044) A bucket was added in https://github.com/llvm/llvm-zorg/pull/650. Wire it up in the job so we can actually take advantage of it. --- .github/workflows/bazel-checks.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/bazel-checks.yml b/.github/workflows/bazel-checks.yml index 7c3db4ed7865f..1b27dbc1dbc4d 100644 --- a/.github/workflows/bazel-checks.yml +++ b/.github/workflows/bazel-checks.yml @@ -52,4 +52,6 @@ jobs: working-directory: utils/bazel run: | bazelisk test --config=ci --sandbox_base="" \ + --remote_cache=https://storage.googleapis.com/$CACHE_GCS_BUCKET-bazel \ + --google_default_credentials \ @llvm-project//llvm/unittests:adt_tests From 75ef0be0c3b6b0313d541b2af673ee4bb091572b Mon Sep 17 00:00:00 2001 From: Alexey Bataev Date: Tue, 11 Nov 2025 10:17:31 -0800 Subject: [PATCH 43/64] [SLP]Be careful when trying match/vectorize copyable nodes with external uses only Need to be careful when trying to match and/or build copyable node with the instructions, used outside the block only and if their operands immediately precede such instructions. In this case insertion point might be the same and it may cause broken def-use chain. Fixes #167366 --- .../Transforms/Vectorize/SLPVectorizer.cpp | 34 ++++++++++- ...copyable-used-outside-with-immediate-op.ll | 57 +++++++++++++++++++ 2 files changed, 90 insertions(+), 1 deletion(-) create mode 100644 llvm/test/Transforms/SLPVectorizer/X86/copyable-used-outside-with-immediate-op.ll diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index ffba0bdbdbe83..cc53b0dd3577e 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -16844,6 +16844,16 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry( } return false; }; + auto CheckNonSchedulableOrdering = [&](const TreeEntry *E, + Instruction *InsertPt) { + return TEUseEI && TEUseEI.UserTE && TEUseEI.UserTE->hasCopyableElements() && + !TEUseEI.UserTE->isCopyableElement( + const_cast(TEInsertPt)) && + isUsedOutsideBlock(const_cast(TEInsertPt)) && + InsertPt->getNextNode() == TEInsertPt && + (!E->hasCopyableElements() || !E->isCopyableElement(InsertPt) || + !isUsedOutsideBlock(InsertPt)); + }; for (Value *V : VL) { if (isConstant(V) || !VisitedValue.insert(V).second) continue; @@ -16926,6 +16936,11 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry( // The node is reused - exit. if (CheckAndUseSameNode(TEPtr)) break; + // The parent node is copyable with last inst used outside? And the last + // inst is the next inst for the lastinst of TEPtr? Exit, if yes, to + // preserve def-use chain. 
+ if (CheckNonSchedulableOrdering(UseEI.UserTE, InsertPt)) + continue; VToTEs.insert(TEPtr); } if (ArrayRef VTEs = getSplitTreeEntries(V); !VTEs.empty()) { @@ -16961,7 +16976,8 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry( if (none_of(TE->CombinedEntriesWithIndices, [&](const auto &P) { return P.first == VTE->Idx; })) { Instruction &LastBundleInst = getLastInstructionInBundle(VTE); - if (&LastBundleInst == TEInsertPt || !CheckOrdering(&LastBundleInst)) + if (&LastBundleInst == TEInsertPt || !CheckOrdering(&LastBundleInst) || + CheckNonSchedulableOrdering(VTE, &LastBundleInst)) continue; } // The node is reused - exit. @@ -21003,6 +21019,22 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef VL, BoUpSLP *SLP, return isUsedOutsideBlock(V); })) return std::nullopt; + // If any instruction is used outside block only and its operand is placed + // immediately before it, do not schedule, it may cause wrong def-use chain. + if (S.areInstructionsWithCopyableElements() && any_of(VL, [&](Value *V) { + if (isa(V) || S.isCopyableElement(V)) + return false; + if (isUsedOutsideBlock(V)) { + for (Value *Op : cast(V)->operands()) { + auto *I = dyn_cast(Op); + if (!I) + continue; + return SLP->isVectorized(I) && I->getNextNode() == V; + } + } + return false; + })) + return std::nullopt; bool HasCopyables = S.areInstructionsWithCopyableElements(); if (((!HasCopyables && doesNotNeedToSchedule(VL)) || all_of(VL, [&](Value *V) { return S.isNonSchedulable(V); }))) { diff --git a/llvm/test/Transforms/SLPVectorizer/X86/copyable-used-outside-with-immediate-op.ll b/llvm/test/Transforms/SLPVectorizer/X86/copyable-used-outside-with-immediate-op.ll new file mode 100644 index 0000000000000..d4aef24962313 --- /dev/null +++ b/llvm/test/Transforms/SLPVectorizer/X86/copyable-used-outside-with-immediate-op.ll @@ -0,0 +1,57 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 +; RUN: opt -passes=slp-vectorizer -S -mtriple=x86_64-unknown-linux-gnu -slp-threshold=-9999 < %s | FileCheck %s + +define void @test() { +; CHECK-LABEL: define void @test() { +; CHECK-NEXT: [[BB:.*]]: +; CHECK-NEXT: br label %[[BB1:.*]] +; CHECK: [[BB1]]: +; CHECK-NEXT: [[TMP0:%.*]] = phi <4 x i32> [ , %[[BB]] ], [ [[TMP6:%.*]], %[[BB14:.*]] ], [ , %[[BB10:.*]] ] +; CHECK-NEXT: br label %[[BB3:.*]] +; CHECK: [[BB3]]: +; CHECK-NEXT: [[TMP1:%.*]] = phi <2 x float> [ zeroinitializer, %[[BB1]] ] +; CHECK-NEXT: [[TMP2:%.*]] = phi <4 x i32> [ [[TMP0]], %[[BB1]] ] +; CHECK-NEXT: br label %[[BB10]] +; CHECK: [[BB10]]: +; CHECK-NEXT: [[PHI12:%.*]] = phi float [ 0.000000e+00, %[[BB3]] ], [ 0.000000e+00, %[[BB14]] ] +; CHECK-NEXT: [[TMP3:%.*]] = phi <4 x i32> [ , %[[BB3]] ], [ [[TMP7:%.*]], %[[BB14]] ] +; CHECK-NEXT: switch i32 0, label %[[BB14]] [ +; CHECK-NEXT: i32 0, label %[[BB1]] +; CHECK-NEXT: ] +; CHECK: [[BB14]]: +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> , <4 x i32> [[TMP3]], <4 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = or <4 x i32> [[TMP3]], [[TMP4]] +; CHECK-NEXT: [[TMP6]] = or <4 x i32> [[TMP5]], +; CHECK-NEXT: [[TMP7]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP6]], <4 x i32> +; CHECK-NEXT: br i1 false, label %[[BB1]], label %[[BB10]] +; +bb: + br label %bb1 + +bb1: + %phi = phi i32 [ 0, %bb ], [ %or16, %bb14 ], [ 0, %bb10 ] + %phi2 = phi i32 [ 0, %bb ], [ %or15, %bb14 ], [ 0, %bb10 ] + br label %bb3 + +bb3: ; preds = %bb1 + %phi4 = phi i32 [ poison, %bb1 ] + %phi6 = phi i32 [ poison, %bb1 ] + %phi7 = phi i32 [ %phi, %bb1 ] + %phi9 = phi i32 [ %phi2, %bb1 ] + %0 = phi <2 x float> [ 
zeroinitializer, %bb1 ] + br label %bb10 + +bb10: + %phi11 = phi i32 [ 0, %bb3 ], [ %phi11, %bb14 ] + %phi12 = phi float [ 0.000000e+00, %bb3 ], [ 0.000000e+00, %bb14 ] + %phi13 = phi i32 [ 0, %bb3 ], [ %or15, %bb14 ] + switch i32 0, label %bb14 [ + i32 0, label %bb1 + ] + +bb14: + %or = or i32 %phi13, %phi11 + %or15 = or i32 %or, 0 + %or16 = or i32 %phi11, 0 + br i1 false, label %bb1, label %bb10 +} From 1e467e44851a9da96c16c0dcd16725f996e6abf7 Mon Sep 17 00:00:00 2001 From: Med Ismail Bennani Date: Tue, 11 Nov 2025 12:18:45 -0800 Subject: [PATCH 44/64] [lldb] Introduce ScriptedFrameProvider for real threads (#161870) This patch extends ScriptedFrame to work with real (non-scripted) threads, enabling frame providers to synthesize frames for native processes. Previously, ScriptedFrame only worked within ScriptedProcess/ScriptedThread contexts. This patch decouples ScriptedFrame from ScriptedThread, allowing users to augment or replace stack frames in real debugging sessions for use cases like custom calling conventions, reconstructing corrupted frames from core files, or adding diagnostic frames. Key changes: - ScriptedFrame::Create() now accepts ThreadSP instead of requiring ScriptedThread, extracting architecture from the target triple rather than ScriptedProcess.arch - Added SBTarget::RegisterScriptedFrameProvider() and ClearScriptedFrameProvider() APIs, with Target storing a SyntheticFrameProviderDescriptor template for new threads - Added "target frame-provider register/clear" commands for CLI access - Thread class gains LoadScriptedFrameProvider(), ClearScriptedFrameProvider(), and GetFrameProvider() methods for per-thread frame provider management - New SyntheticStackFrameList overrides FetchFramesUpTo() to lazily provide frames from either the frame provider or the real stack This enables practical use of the SyntheticFrameProvider infrastructure in real debugging workflows. 
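For illustration, the new surface can be driven either from the SB API
or from the new commands. A minimal sketch, assuming a provider class
MyProvider in a module my_providers.py that follows the
ScriptedFrameProvider template (both of those names are hypothetical;
the API, option, and command names are the ones added by this patch):

    import lldb

    # 'target' is assumed to be a valid lldb.SBTarget whose script
    # interpreter has already imported my_providers.py.
    args = lldb.SBStructuredData()
    error = lldb.SBError()
    provider_id = target.RegisterScriptedFrameProvider(
        "my_providers.MyProvider", args, error)
    if error.Success():
        # New threads now build their stacks through the provider.
        # ... inspect frames via the usual thread/frame APIs ...
        target.RemoveScriptedFrameProvider(provider_id)

or, equivalently, from the command line:

    (lldb) command script import my_providers.py
    (lldb) target frame-provider register -C my_providers.MyProvider
    (lldb) target frame-provider list
    (lldb) target frame-provider remove <provider-id>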
rdar://161834688 Signed-off-by: Med Ismail Bennani Signed-off-by: Med Ismail Bennani --- lldb/bindings/python/python-wrapper.swig | 12 + .../templates/scripted_frame_provider.py | 47 +++ .../python/templates/scripted_process.py | 47 ++- lldb/include/lldb/API/SBTarget.h | 30 ++ lldb/include/lldb/API/SBThread.h | 1 + lldb/include/lldb/API/SBThreadCollection.h | 1 + .../ScriptedFrameProviderInterface.h | 18 + .../lldb/Interpreter/ScriptInterpreter.h | 3 + lldb/include/lldb/Target/StackFrame.h | 7 +- lldb/include/lldb/Target/StackFrameList.h | 36 +- .../lldb/Target/SyntheticFrameProvider.h | 30 +- lldb/include/lldb/Target/Target.h | 38 ++ lldb/include/lldb/Target/Thread.h | 12 + lldb/include/lldb/Target/ThreadSpec.h | 2 + lldb/include/lldb/Utility/ScriptedMetadata.h | 27 ++ lldb/include/lldb/lldb-private-interfaces.h | 4 +- lldb/source/API/SBTarget.cpp | 82 +++++ lldb/source/Commands/CommandObjectTarget.cpp | 200 +++++++++++ lldb/source/Interpreter/ScriptInterpreter.cpp | 7 + lldb/source/Plugins/CMakeLists.txt | 1 + .../Process/scripted/ScriptedFrame.cpp | 85 +++-- .../Plugins/Process/scripted/ScriptedFrame.h | 33 +- .../Process/scripted/ScriptedThread.cpp | 6 +- .../ScriptInterpreterPythonInterfaces.cpp | 2 + .../ScriptedFrameProviderPythonInterface.cpp | 58 ++- .../ScriptedFrameProviderPythonInterface.h | 23 +- .../Interfaces/ScriptedPythonInterface.cpp | 13 + .../Interfaces/ScriptedPythonInterface.h | 121 ++++++- .../Python/SWIGPythonBridge.h | 1 + .../SyntheticFrameProvider/CMakeLists.txt | 1 + .../ScriptedFrameProvider/CMakeLists.txt | 12 + .../ScriptedFrameProvider.cpp | 215 +++++++++++ .../ScriptedFrameProvider.h | 53 +++ lldb/source/Target/StackFrameList.cpp | 35 ++ lldb/source/Target/SyntheticFrameProvider.cpp | 25 +- lldb/source/Target/Target.cpp | 55 +++ lldb/source/Target/Thread.cpp | 72 +++- lldb/source/Target/ThreadSpec.cpp | 4 + .../scripted_frame_provider/Makefile | 3 + .../TestScriptedFrameProvider.py | 339 ++++++++++++++++++ .../scripted_frame_provider/main.cpp | 55 +++ .../test_frame_providers.py | 176 +++++++++ .../Python/PythonTestSuite.cpp | 5 + 43 files changed, 1918 insertions(+), 79 deletions(-) create mode 100644 lldb/source/Plugins/SyntheticFrameProvider/CMakeLists.txt create mode 100644 lldb/source/Plugins/SyntheticFrameProvider/ScriptedFrameProvider/CMakeLists.txt create mode 100644 lldb/source/Plugins/SyntheticFrameProvider/ScriptedFrameProvider/ScriptedFrameProvider.cpp create mode 100644 lldb/source/Plugins/SyntheticFrameProvider/ScriptedFrameProvider/ScriptedFrameProvider.h create mode 100644 lldb/test/API/functionalities/scripted_frame_provider/Makefile create mode 100644 lldb/test/API/functionalities/scripted_frame_provider/TestScriptedFrameProvider.py create mode 100644 lldb/test/API/functionalities/scripted_frame_provider/main.cpp create mode 100644 lldb/test/API/functionalities/scripted_frame_provider/test_frame_providers.py diff --git a/lldb/bindings/python/python-wrapper.swig b/lldb/bindings/python/python-wrapper.swig index 3a0995e84f643..84fb3a95c0942 100644 --- a/lldb/bindings/python/python-wrapper.swig +++ b/lldb/bindings/python/python-wrapper.swig @@ -422,6 +422,18 @@ void *lldb_private::python::LLDBSWIGPython_CastPyObjectToSBBreakpoint(PyObject * return sb_ptr; } +void *lldb_private::python::LLDBSWIGPython_CastPyObjectToSBThread(PyObject * data) { + lldb::SBThread *sb_ptr = nullptr; + + int valid_cast = + SWIG_ConvertPtr(data, (void **)&sb_ptr, SWIGTYPE_p_lldb__SBThread, 0); + + if (valid_cast == -1) + return NULL; + + return sb_ptr; +} + void 
*lldb_private::python::LLDBSWIGPython_CastPyObjectToSBFrame(PyObject * data) { lldb::SBFrame *sb_ptr = nullptr; diff --git a/lldb/examples/python/templates/scripted_frame_provider.py b/lldb/examples/python/templates/scripted_frame_provider.py index 20f4d76d188c2..7a72f1a24c9da 100644 --- a/lldb/examples/python/templates/scripted_frame_provider.py +++ b/lldb/examples/python/templates/scripted_frame_provider.py @@ -31,7 +31,54 @@ class ScriptedFrameProvider(metaclass=ABCMeta): ) """ + @staticmethod + def applies_to_thread(thread): + """Determine if this frame provider should be used for a given thread. + + This static method is called before creating an instance of the frame + provider to determine if it should be applied to a specific thread. + Override this method to provide custom filtering logic. + + Args: + thread (lldb.SBThread): The thread to check. + + Returns: + bool: True if this frame provider should be used for the thread, + False otherwise. The default implementation returns True for + all threads. + + Example: + + .. code-block:: python + + @staticmethod + def applies_to_thread(thread): + # Only apply to thread 1 + return thread.GetIndexID() == 1 + """ + return True + + @staticmethod @abstractmethod + def get_description(): + """Get a description of this frame provider. + + This method should return a human-readable string describing what + this frame provider does. The description is used for debugging + and display purposes. + + Returns: + str: A description of the frame provider. + + Example: + + .. code-block:: python + + def get_description(self): + return "Crash log frame provider for thread 1" + """ + pass + def __init__(self, input_frames, args): """Construct a scripted frame provider. diff --git a/lldb/examples/python/templates/scripted_process.py b/lldb/examples/python/templates/scripted_process.py index 49059d533f38a..136edce165140 100644 --- a/lldb/examples/python/templates/scripted_process.py +++ b/lldb/examples/python/templates/scripted_process.py @@ -245,6 +245,7 @@ def __init__(self, process, args): key/value pairs used by the scripted thread. """ self.target = None + self.arch = None self.originating_process = None self.process = None self.args = None @@ -266,6 +267,9 @@ def __init__(self, process, args): and process.IsValid() ): self.target = process.target + triple = self.target.triple + if triple: + self.arch = triple.split("-")[0] self.originating_process = process self.process = self.target.GetProcess() self.get_register_info() @@ -352,17 +356,14 @@ def get_stackframes(self): def get_register_info(self): if self.register_info is None: self.register_info = dict() - if "x86_64" in self.originating_process.arch: + if "x86_64" in self.arch: self.register_info["sets"] = ["General Purpose Registers"] self.register_info["registers"] = INTEL64_GPR - elif ( - "arm64" in self.originating_process.arch - or self.originating_process.arch == "aarch64" - ): + elif "arm64" in self.arch or self.arch == "aarch64": self.register_info["sets"] = ["General Purpose Registers"] self.register_info["registers"] = ARM64_GPR else: - raise ValueError("Unknown architecture", self.originating_process.arch) + raise ValueError("Unknown architecture", self.arch) return self.register_info @abstractmethod @@ -405,11 +406,12 @@ def __init__(self, thread, args): """Construct a scripted frame. Args: - thread (ScriptedThread): The thread owning this frame. + thread (ScriptedThread/lldb.SBThread): The thread owning this frame. 
args (lldb.SBStructuredData): A Dictionary holding arbitrary key/value pairs used by the scripted frame. """ self.target = None + self.arch = None self.originating_thread = None self.thread = None self.args = None @@ -419,15 +421,17 @@ def __init__(self, thread, args): self.register_ctx = {} self.variables = [] - if ( - isinstance(thread, ScriptedThread) - or isinstance(thread, lldb.SBThread) - and thread.IsValid() + if isinstance(thread, ScriptedThread) or ( + isinstance(thread, lldb.SBThread) and thread.IsValid() ): - self.target = thread.target self.process = thread.process + self.target = self.process.target + triple = self.target.triple + if triple: + self.arch = triple.split("-")[0] + tid = thread.tid if isinstance(thread, ScriptedThread) else thread.id self.originating_thread = thread - self.thread = self.process.GetThreadByIndexID(thread.tid) + self.thread = self.process.GetThreadByIndexID(tid) self.get_register_info() @abstractmethod @@ -508,7 +512,18 @@ def get_variables(self, filters): def get_register_info(self): if self.register_info is None: - self.register_info = self.originating_thread.get_register_info() + if isinstance(self.originating_thread, ScriptedThread): + self.register_info = self.originating_thread.get_register_info() + elif isinstance(self.originating_thread, lldb.SBThread): + self.register_info = dict() + if "x86_64" in self.arch: + self.register_info["sets"] = ["General Purpose Registers"] + self.register_info["registers"] = INTEL64_GPR + elif "arm64" in self.arch or self.arch == "aarch64": + self.register_info["sets"] = ["General Purpose Registers"] + self.register_info["registers"] = ARM64_GPR + else: + raise ValueError("Unknown architecture", self.arch) return self.register_info @abstractmethod @@ -642,12 +657,12 @@ def get_stop_reason(self): # TODO: Passthrough stop reason from driving process if self.driving_thread.GetStopReason() != lldb.eStopReasonNone: - if "arm64" in self.originating_process.arch: + if "arm64" in self.arch: stop_reason["type"] = lldb.eStopReasonException stop_reason["data"]["desc"] = ( self.driving_thread.GetStopDescription(100) ) - elif self.originating_process.arch == "x86_64": + elif self.arch == "x86_64": stop_reason["type"] = lldb.eStopReasonSignal stop_reason["data"]["signal"] = signal.SIGTRAP else: diff --git a/lldb/include/lldb/API/SBTarget.h b/lldb/include/lldb/API/SBTarget.h index 379a0bb7e9513..c2fd3e2f50e3b 100644 --- a/lldb/include/lldb/API/SBTarget.h +++ b/lldb/include/lldb/API/SBTarget.h @@ -19,6 +19,7 @@ #include "lldb/API/SBLaunchInfo.h" #include "lldb/API/SBStatisticsOptions.h" #include "lldb/API/SBSymbolContextList.h" +#include "lldb/API/SBThreadCollection.h" #include "lldb/API/SBType.h" #include "lldb/API/SBValue.h" #include "lldb/API/SBWatchpoint.h" @@ -986,6 +987,35 @@ class LLDB_API SBTarget { lldb::SBMutex GetAPIMutex() const; + /// Register a scripted frame provider for this target. + /// If a scripted frame provider with the same name and same argument + /// dictionary is already registered on this target, it will be overwritten. + /// + /// \param[in] class_name + /// The name of the Python class that implements the frame provider. + /// + /// \param[in] args_dict + /// A dictionary of arguments to pass to the frame provider class. + /// + /// \param[out] error + /// An error object indicating success or failure. + /// + /// \return + /// A unique identifier for the frame provider descriptor that was + /// registered. 0 if the registration failed. 
+ uint32_t RegisterScriptedFrameProvider(const char *class_name, + lldb::SBStructuredData args_dict, + lldb::SBError &error); + + /// Remove a scripted frame provider from this target by name. + /// + /// \param[in] provider_id + /// The id of the frame provider class to remove. + /// + /// \return + /// An error object indicating success or failure. + lldb::SBError RemoveScriptedFrameProvider(uint32_t provider_id); + protected: friend class SBAddress; friend class SBAddressRange; diff --git a/lldb/include/lldb/API/SBThread.h b/lldb/include/lldb/API/SBThread.h index f6a6d19935b83..639e7a0a1a5c0 100644 --- a/lldb/include/lldb/API/SBThread.h +++ b/lldb/include/lldb/API/SBThread.h @@ -256,6 +256,7 @@ class LLDB_API SBThread { friend class SBThreadPlan; friend class SBTrace; + friend class lldb_private::ScriptInterpreter; friend class lldb_private::python::SWIGBridge; SBThread(const lldb::ThreadSP &lldb_object_sp); diff --git a/lldb/include/lldb/API/SBThreadCollection.h b/lldb/include/lldb/API/SBThreadCollection.h index 5a052e6246026..d13dea0f11cd2 100644 --- a/lldb/include/lldb/API/SBThreadCollection.h +++ b/lldb/include/lldb/API/SBThreadCollection.h @@ -46,6 +46,7 @@ class LLDB_API SBThreadCollection { void SetOpaque(const lldb::ThreadCollectionSP &threads); private: + friend class SBTarget; friend class SBProcess; friend class SBThread; friend class SBSaveCoreOptions; diff --git a/lldb/include/lldb/Interpreter/Interfaces/ScriptedFrameProviderInterface.h b/lldb/include/lldb/Interpreter/Interfaces/ScriptedFrameProviderInterface.h index 2d9f713676f90..49b60131399d5 100644 --- a/lldb/include/lldb/Interpreter/Interfaces/ScriptedFrameProviderInterface.h +++ b/lldb/include/lldb/Interpreter/Interfaces/ScriptedFrameProviderInterface.h @@ -16,11 +16,29 @@ namespace lldb_private { class ScriptedFrameProviderInterface : public ScriptedInterface { public: + virtual bool AppliesToThread(llvm::StringRef class_name, + lldb::ThreadSP thread_sp) { + return true; + } + virtual llvm::Expected CreatePluginObject(llvm::StringRef class_name, lldb::StackFrameListSP input_frames, StructuredData::DictionarySP args_sp) = 0; + /// Get a description string for the frame provider. + /// + /// This is called by the descriptor to fetch a description from the + /// scripted implementation. Implementations should call a static method + /// on the scripting class to retrieve the description. + /// + /// \param class_name The name of the scripting class implementing the + /// provider. + /// + /// \return A string describing what this frame provider does, or an + /// empty string if no description is available. 
+ virtual std::string GetDescription(llvm::StringRef class_name) { return {}; } + virtual StructuredData::ObjectSP GetFrameAtIndex(uint32_t index) { return {}; } diff --git a/lldb/include/lldb/Interpreter/ScriptInterpreter.h b/lldb/include/lldb/Interpreter/ScriptInterpreter.h index 7fed4940b85bf..0b91d6756552d 100644 --- a/lldb/include/lldb/Interpreter/ScriptInterpreter.h +++ b/lldb/include/lldb/Interpreter/ScriptInterpreter.h @@ -21,6 +21,7 @@ #include "lldb/API/SBMemoryRegionInfo.h" #include "lldb/API/SBStream.h" #include "lldb/API/SBSymbolContext.h" +#include "lldb/API/SBThread.h" #include "lldb/Breakpoint/BreakpointOptions.h" #include "lldb/Core/PluginInterface.h" #include "lldb/Core/SearchFilter.h" @@ -580,6 +581,8 @@ class ScriptInterpreter : public PluginInterface { lldb::StreamSP GetOpaqueTypeFromSBStream(const lldb::SBStream &stream) const; + lldb::ThreadSP GetOpaqueTypeFromSBThread(const lldb::SBThread &exe_ctx) const; + lldb::StackFrameSP GetOpaqueTypeFromSBFrame(const lldb::SBFrame &frame) const; SymbolContext diff --git a/lldb/include/lldb/Target/StackFrame.h b/lldb/include/lldb/Target/StackFrame.h index cdbe8ae3c6779..af2e49b4a67da 100644 --- a/lldb/include/lldb/Target/StackFrame.h +++ b/lldb/include/lldb/Target/StackFrame.h @@ -441,8 +441,11 @@ class StackFrame : public ExecutionContextScope, /// frames are included in this frame index count. uint32_t GetFrameIndex() const; - /// Set this frame's synthetic frame index. - void SetFrameIndex(uint32_t index) { m_frame_index = index; } + /// Set this frame's frame index. + void SetFrameIndex(uint32_t index) { + m_frame_index = index; + m_concrete_frame_index = index; + } /// Query this frame to find what frame it is in this Thread's /// StackFrameList, not counting inlined frames. diff --git a/lldb/include/lldb/Target/StackFrameList.h b/lldb/include/lldb/Target/StackFrameList.h index 5b0df0ddb3e29..539c070ff0f4b 100644 --- a/lldb/include/lldb/Target/StackFrameList.h +++ b/lldb/include/lldb/Target/StackFrameList.h @@ -20,13 +20,13 @@ namespace lldb_private { class ScriptedThread; -class StackFrameList { +class StackFrameList : public std::enable_shared_from_this { public: // Constructors and Destructors StackFrameList(Thread &thread, const lldb::StackFrameListSP &prev_frames_sp, bool show_inline_frames); - ~StackFrameList(); + virtual ~StackFrameList(); /// Get the number of visible frames. Frames may be created if \p can_create /// is true. Synthetic (inline) frames expanded from the concrete frame #0 @@ -106,6 +106,7 @@ class StackFrameList { protected: friend class Thread; + friend class ScriptedFrameProvider; friend class ScriptedThread; /// Use this API to build a stack frame list (used for scripted threads, for @@ -211,19 +212,23 @@ class StackFrameList { /// Whether or not to show synthetic (inline) frames. Immutable. const bool m_show_inlined_frames; + /// Returns true if fetching frames was interrupted, false otherwise. + virtual bool FetchFramesUpTo(uint32_t end_idx, + InterruptionControl allow_interrupt); + private: uint32_t SetSelectedFrameNoLock(lldb_private::StackFrame *frame); lldb::StackFrameSP GetFrameAtIndexNoLock(uint32_t idx, std::shared_lock &guard); + /// @{ /// These two Fetch frames APIs and SynthesizeTailCallFrames are called in /// GetFramesUpTo, they are the ones that actually add frames. They must be /// called with the writer end of the list mutex held. - - /// Returns true if fetching frames was interrupted, false otherwise. 
- bool FetchFramesUpTo(uint32_t end_idx, InterruptionControl allow_interrupt); + /// /// Not currently interruptible so returns void. + /// }@ void FetchOnlyConcreteFramesUpTo(uint32_t end_idx); void SynthesizeTailCallFrames(StackFrame &next_frame); @@ -231,6 +236,27 @@ class StackFrameList { const StackFrameList &operator=(const StackFrameList &) = delete; }; +/// A StackFrameList that wraps another StackFrameList and uses a +/// SyntheticFrameProvider to lazily provide frames from either the provider +/// or the underlying real stack frame list. +class SyntheticStackFrameList : public StackFrameList { +public: + SyntheticStackFrameList(Thread &thread, lldb::StackFrameListSP input_frames, + const lldb::StackFrameListSP &prev_frames_sp, + bool show_inline_frames); + +protected: + /// Override FetchFramesUpTo to lazily return frames from the provider + /// or from the actual stack frame list. + bool FetchFramesUpTo(uint32_t end_idx, + InterruptionControl allow_interrupt) override; + +private: + /// The input stack frame list that the provider transforms. + /// This could be a real StackFrameList or another SyntheticStackFrameList. + lldb::StackFrameListSP m_input_frames; +}; + } // namespace lldb_private #endif // LLDB_TARGET_STACKFRAMELIST_H diff --git a/lldb/include/lldb/Target/SyntheticFrameProvider.h b/lldb/include/lldb/Target/SyntheticFrameProvider.h index 61a492f356ece..2d5330cb03105 100644 --- a/lldb/include/lldb/Target/SyntheticFrameProvider.h +++ b/lldb/include/lldb/Target/SyntheticFrameProvider.h @@ -24,22 +24,25 @@ namespace lldb_private { /// This struct contains the metadata needed to instantiate a frame provider /// and optional filters to control which threads it applies to. -struct SyntheticFrameProviderDescriptor { +struct ScriptedFrameProviderDescriptor { /// Metadata for instantiating the provider (e.g. script class name and args). lldb::ScriptedMetadataSP scripted_metadata_sp; + /// Interface for calling static methods on the provider class. + lldb::ScriptedFrameProviderInterfaceSP interface_sp; + /// Optional list of thread specifications to which this provider applies. /// If empty, the provider applies to all threads. A thread matches if it /// satisfies ANY of the specs in this vector (OR logic). std::vector thread_specs; - SyntheticFrameProviderDescriptor() = default; + ScriptedFrameProviderDescriptor() = default; - SyntheticFrameProviderDescriptor(lldb::ScriptedMetadataSP metadata_sp) + ScriptedFrameProviderDescriptor(lldb::ScriptedMetadataSP metadata_sp) : scripted_metadata_sp(metadata_sp) {} - SyntheticFrameProviderDescriptor(lldb::ScriptedMetadataSP metadata_sp, - const std::vector &specs) + ScriptedFrameProviderDescriptor(lldb::ScriptedMetadataSP metadata_sp, + const std::vector &specs) : scripted_metadata_sp(metadata_sp), thread_specs(specs) {} /// Get the name of this descriptor (the scripted class name). @@ -47,6 +50,12 @@ struct SyntheticFrameProviderDescriptor { return scripted_metadata_sp ? scripted_metadata_sp->GetClassName() : ""; } + /// Get the description of this frame provider. + /// + /// \return A string describing what this frame provider does, or an + /// empty string if no description is available. + std::string GetDescription() const; + /// Check if this descriptor applies to the given thread. bool AppliesToThread(Thread &thread) const { // If no thread specs specified, applies to all threads. @@ -64,6 +73,13 @@ struct SyntheticFrameProviderDescriptor { /// Check if this descriptor has valid metadata for script-based providers. 
bool IsValid() const { return scripted_metadata_sp != nullptr; } + /// Get a unique identifier for this descriptor based on its contents. + /// The ID is computed from the class name and arguments dictionary, + /// not from the pointer address, so two descriptors with the same + /// contents will have the same ID. + uint32_t GetID() const; + + /// Dump a description of this descriptor to the given stream. void Dump(Stream *s) const; }; @@ -95,7 +111,7 @@ class SyntheticFrameProvider : public PluginInterface { /// otherwise an \a llvm::Error. static llvm::Expected CreateInstance(lldb::StackFrameListSP input_frames, - const SyntheticFrameProviderDescriptor &descriptor); + const ScriptedFrameProviderDescriptor &descriptor); /// Try to create a SyntheticFrameProvider instance for the given input /// frames using a specific C++ plugin. @@ -125,6 +141,8 @@ class SyntheticFrameProvider : public PluginInterface { ~SyntheticFrameProvider() override; + virtual std::string GetDescription() const = 0; + /// Get a single stack frame at the specified index. /// /// This method is called lazily - frames are only created when requested. diff --git a/lldb/include/lldb/Target/Target.h b/lldb/include/lldb/Target/Target.h index 40f9c9bea1c12..cbda1c835f6bd 100644 --- a/lldb/include/lldb/Target/Target.h +++ b/lldb/include/lldb/Target/Target.h @@ -32,6 +32,7 @@ #include "lldb/Target/PathMappingList.h" #include "lldb/Target/SectionLoadHistory.h" #include "lldb/Target/Statistics.h" +#include "lldb/Target/SyntheticFrameProvider.h" #include "lldb/Target/ThreadSpec.h" #include "lldb/Utility/ArchSpec.h" #include "lldb/Utility/Broadcaster.h" @@ -697,6 +698,36 @@ class Target : public std::enable_shared_from_this, Status Attach(ProcessAttachInfo &attach_info, Stream *stream); // Optional stream to receive first stop info + /// Add or update a scripted frame provider descriptor for this target. + /// All new threads in this target will check if they match any descriptors + /// to create their frame providers. + /// + /// \param[in] descriptor + /// The descriptor to add or update. + /// + /// \return + /// The descriptor identifier if the registration succeeded, otherwise an + /// llvm::Error. + llvm::Expected AddScriptedFrameProviderDescriptor( + const ScriptedFrameProviderDescriptor &descriptor); + + /// Remove a scripted frame provider descriptor by id. + /// + /// \param[in] id + /// The id of the descriptor to remove. + /// + /// \return + /// True if a descriptor was removed, false if no descriptor with that + /// id existed. + bool RemoveScriptedFrameProviderDescriptor(uint32_t id); + + /// Clear all scripted frame provider descriptors for this target. + void ClearScriptedFrameProviderDescriptors(); + + /// Get all scripted frame provider descriptors for this target. + const llvm::DenseMap & + GetScriptedFrameProviderDescriptors() const; + // This part handles the breakpoints. BreakpointList &GetBreakpointList(bool internal = false); @@ -1689,6 +1720,13 @@ class Target : public std::enable_shared_from_this, PathMappingList m_image_search_paths; TypeSystemMap m_scratch_type_system_map; + /// Map of scripted frame provider descriptors for this target. + /// Keys are the provider descriptors ids, values are the descriptors. + /// Used to initialize frame providers for new threads. 
+ llvm::DenseMap + m_frame_provider_descriptors; + mutable std::recursive_mutex m_frame_provider_descriptors_mutex; + typedef std::map REPLMap; REPLMap m_repl_map; diff --git a/lldb/include/lldb/Target/Thread.h b/lldb/include/lldb/Target/Thread.h index 841f80cd1b1eb..46ce192556756 100644 --- a/lldb/include/lldb/Target/Thread.h +++ b/lldb/include/lldb/Target/Thread.h @@ -1297,6 +1297,15 @@ class Thread : public std::enable_shared_from_this, lldb::StackFrameListSP GetStackFrameList(); + llvm::Error + LoadScriptedFrameProvider(const ScriptedFrameProviderDescriptor &descriptor); + + void ClearScriptedFrameProvider(); + + lldb::SyntheticFrameProviderSP GetFrameProvider() const { + return m_frame_provider_sp; + } + protected: friend class ThreadPlan; friend class ThreadList; @@ -1400,6 +1409,9 @@ class Thread : public std::enable_shared_from_this, /// The Thread backed by this thread, if any. lldb::ThreadWP m_backed_thread; + /// The Scripted Frame Provider, if any. + lldb::SyntheticFrameProviderSP m_frame_provider_sp; + private: bool m_extended_info_fetched; // Have we tried to retrieve the m_extended_info // for this thread? diff --git a/lldb/include/lldb/Target/ThreadSpec.h b/lldb/include/lldb/Target/ThreadSpec.h index 7c7c832741196..63f8f8b5ec181 100644 --- a/lldb/include/lldb/Target/ThreadSpec.h +++ b/lldb/include/lldb/Target/ThreadSpec.h @@ -34,6 +34,8 @@ class ThreadSpec { public: ThreadSpec(); + ThreadSpec(Thread &thread); + static std::unique_ptr CreateFromStructuredData(const StructuredData::Dictionary &data_dict, Status &error); diff --git a/lldb/include/lldb/Utility/ScriptedMetadata.h b/lldb/include/lldb/Utility/ScriptedMetadata.h index 69c83edce909a..8523c95429718 100644 --- a/lldb/include/lldb/Utility/ScriptedMetadata.h +++ b/lldb/include/lldb/Utility/ScriptedMetadata.h @@ -10,7 +10,9 @@ #define LLDB_INTERPRETER_SCRIPTEDMETADATA_H #include "lldb/Utility/ProcessInfo.h" +#include "lldb/Utility/StreamString.h" #include "lldb/Utility/StructuredData.h" +#include "llvm/ADT/Hashing.h" namespace lldb_private { class ScriptedMetadata { @@ -27,11 +29,36 @@ class ScriptedMetadata { } } + ScriptedMetadata(const ScriptedMetadata &other) + : m_class_name(other.m_class_name), m_args_sp(other.m_args_sp) {} + explicit operator bool() const { return !m_class_name.empty(); } llvm::StringRef GetClassName() const { return m_class_name; } StructuredData::DictionarySP GetArgsSP() const { return m_args_sp; } + /// Get a unique identifier for this metadata based on its contents. + /// The ID is computed from the class name and arguments dictionary, + /// not from the pointer address, so two metadata objects with the same + /// contents will have the same ID. + uint32_t GetID() const { + if (m_class_name.empty()) + return 0; + + // Hash the class name. + llvm::hash_code hash = llvm::hash_value(m_class_name); + + // Hash the arguments dictionary if present. + if (m_args_sp) { + StreamString ss; + m_args_sp->GetDescription(ss); + hash = llvm::hash_combine(hash, llvm::hash_value(ss.GetData())); + } + + // Return the lower 32 bits of the hash. 
+ return static_cast(hash); + } + private: std::string m_class_name; StructuredData::DictionarySP m_args_sp; diff --git a/lldb/include/lldb/lldb-private-interfaces.h b/lldb/include/lldb/lldb-private-interfaces.h index 5fc5c14c52f9e..52806eea190a7 100644 --- a/lldb/include/lldb/lldb-private-interfaces.h +++ b/lldb/include/lldb/lldb-private-interfaces.h @@ -26,7 +26,7 @@ class Value; namespace lldb_private { class ScriptedInterfaceUsages; -struct SyntheticFrameProviderDescriptor; +struct ScriptedFrameProviderDescriptor; typedef lldb::ABISP (*ABICreateInstance)(lldb::ProcessSP process_sp, const ArchSpec &arch); typedef std::unique_ptr (*ArchitectureCreateInstance)( @@ -91,7 +91,7 @@ typedef lldb::ScriptInterpreterSP (*ScriptInterpreterCreateInstance)( typedef llvm::Expected ( *ScriptedFrameProviderCreateInstance)( lldb::StackFrameListSP input_frames, - const lldb_private::SyntheticFrameProviderDescriptor &descriptor); + const lldb_private::ScriptedFrameProviderDescriptor &descriptor); typedef llvm::Expected ( *SyntheticFrameProviderCreateInstance)( lldb::StackFrameListSP input_frames, diff --git a/lldb/source/API/SBTarget.cpp b/lldb/source/API/SBTarget.cpp index 98d10aa07c53f..bb1d98b6e15c1 100644 --- a/lldb/source/API/SBTarget.cpp +++ b/lldb/source/API/SBTarget.cpp @@ -23,6 +23,7 @@ #include "lldb/API/SBStringList.h" #include "lldb/API/SBStructuredData.h" #include "lldb/API/SBSymbolContextList.h" +#include "lldb/API/SBThreadCollection.h" #include "lldb/API/SBTrace.h" #include "lldb/Breakpoint/BreakpointID.h" #include "lldb/Breakpoint/BreakpointIDList.h" @@ -39,6 +40,7 @@ #include "lldb/Core/Section.h" #include "lldb/Core/StructuredDataImpl.h" #include "lldb/Host/Host.h" +#include "lldb/Interpreter/Interfaces/ScriptedFrameProviderInterface.h" #include "lldb/Symbol/DeclVendor.h" #include "lldb/Symbol/ObjectFile.h" #include "lldb/Symbol/SymbolFile.h" @@ -50,6 +52,7 @@ #include "lldb/Target/LanguageRuntime.h" #include "lldb/Target/Process.h" #include "lldb/Target/StackFrame.h" +#include "lldb/Target/SyntheticFrameProvider.h" #include "lldb/Target/Target.h" #include "lldb/Target/TargetList.h" #include "lldb/Utility/ArchSpec.h" @@ -59,6 +62,7 @@ #include "lldb/Utility/LLDBLog.h" #include "lldb/Utility/ProcessInfo.h" #include "lldb/Utility/RegularExpression.h" +#include "lldb/Utility/ScriptedMetadata.h" #include "lldb/ValueObject/ValueObjectConstResult.h" #include "lldb/ValueObject/ValueObjectList.h" #include "lldb/ValueObject/ValueObjectVariable.h" @@ -2408,3 +2412,81 @@ lldb::SBMutex SBTarget::GetAPIMutex() const { return lldb::SBMutex(target_sp); return lldb::SBMutex(); } + +uint32_t +SBTarget::RegisterScriptedFrameProvider(const char *class_name, + lldb::SBStructuredData args_dict, + lldb::SBError &error) { + LLDB_INSTRUMENT_VA(this, class_name, args_dict, error); + + TargetSP target_sp = GetSP(); + if (!target_sp) { + error.SetErrorString("invalid target"); + return 0; + } + + if (!class_name || !class_name[0]) { + error.SetErrorString("invalid class name"); + return 0; + } + + // Extract the dictionary from SBStructuredData. + StructuredData::DictionarySP dict_sp; + if (args_dict.IsValid() && args_dict.m_impl_up) { + StructuredData::ObjectSP obj_sp = args_dict.m_impl_up->GetObjectSP(); + if (obj_sp && obj_sp->GetType() != lldb::eStructuredDataTypeDictionary) { + error.SetErrorString("SBStructuredData argument isn't a dictionary"); + return 0; + } + dict_sp = std::make_shared(obj_sp); + } + + // Create the ScriptedMetadata. 
+ ScriptedMetadataSP metadata_sp = + std::make_shared(class_name, dict_sp); + + // Create the interface for calling static methods. + ScriptedFrameProviderInterfaceSP interface_sp = + target_sp->GetDebugger() + .GetScriptInterpreter() + ->CreateScriptedFrameProviderInterface(); + + // Create a descriptor (applies to all threads by default). + ScriptedFrameProviderDescriptor descriptor(metadata_sp); + descriptor.interface_sp = interface_sp; + + llvm::Expected descriptor_id_or_err = + target_sp->AddScriptedFrameProviderDescriptor(descriptor); + if (!descriptor_id_or_err) { + error.SetErrorString( + llvm::toString(descriptor_id_or_err.takeError()).c_str()); + return 0; + } + + // Register the descriptor with the target. + return *descriptor_id_or_err; +} + +lldb::SBError SBTarget::RemoveScriptedFrameProvider(uint32_t provider_id) { + LLDB_INSTRUMENT_VA(this, provider_id); + + SBError error; + TargetSP target_sp = GetSP(); + if (!target_sp) { + error.SetErrorString("invalid target"); + return error; + } + + if (!provider_id) { + error.SetErrorString("invalid provider id"); + return error; + } + + if (!target_sp->RemoveScriptedFrameProviderDescriptor(provider_id)) { + error.SetErrorStringWithFormat("no frame provider named '%u' found", + provider_id); + return error; + } + + return {}; +} diff --git a/lldb/source/Commands/CommandObjectTarget.cpp b/lldb/source/Commands/CommandObjectTarget.cpp index 30bca639060e6..86373f5280271 100644 --- a/lldb/source/Commands/CommandObjectTarget.cpp +++ b/lldb/source/Commands/CommandObjectTarget.cpp @@ -51,6 +51,7 @@ #include "lldb/Utility/ConstString.h" #include "lldb/Utility/FileSpec.h" #include "lldb/Utility/LLDBLog.h" +#include "lldb/Utility/ScriptedMetadata.h" #include "lldb/Utility/State.h" #include "lldb/Utility/Stream.h" #include "lldb/Utility/StructuredData.h" @@ -5401,6 +5402,202 @@ class CommandObjectTargetDump : public CommandObjectMultiword { ~CommandObjectTargetDump() override = default; }; +#pragma mark CommandObjectTargetFrameProvider + +#define LLDB_OPTIONS_target_frame_provider_register +#include "CommandOptions.inc" + +class CommandObjectTargetFrameProviderRegister : public CommandObjectParsed { +public: + CommandObjectTargetFrameProviderRegister(CommandInterpreter &interpreter) + : CommandObjectParsed( + interpreter, "target frame-provider register", + "Register frame provider for all threads in this target.", nullptr, + eCommandRequiresTarget), + + m_class_options("target frame-provider", true, 'C', 'k', 'v', 0) { + m_all_options.Append(&m_class_options, LLDB_OPT_SET_1 | LLDB_OPT_SET_2, + LLDB_OPT_SET_ALL); + m_all_options.Finalize(); + + AddSimpleArgumentList(eArgTypeRunArgs, eArgRepeatOptional); + } + + ~CommandObjectTargetFrameProviderRegister() override = default; + + Options *GetOptions() override { return &m_all_options; } + + std::optional GetRepeatCommand(Args ¤t_command_args, + uint32_t index) override { + return std::string(""); + } + +protected: + void DoExecute(Args &launch_args, CommandReturnObject &result) override { + ScriptedMetadataSP metadata_sp = std::make_shared( + m_class_options.GetName(), m_class_options.GetStructuredData()); + + Target *target = m_exe_ctx.GetTargetPtr(); + if (!target) + target = &GetDebugger().GetDummyTarget(); + + // Create the interface for calling static methods. + ScriptedFrameProviderInterfaceSP interface_sp = + GetDebugger() + .GetScriptInterpreter() + ->CreateScriptedFrameProviderInterface(); + + // Create a descriptor from the metadata (applies to all threads by + // default). 
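+    // (An empty thread_specs list is what "applies to all threads" means;
+    //  ScriptedFrameProviderDescriptor::Dump prints the same wording.)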
+ ScriptedFrameProviderDescriptor descriptor(metadata_sp); + descriptor.interface_sp = interface_sp; + + auto id_or_err = target->AddScriptedFrameProviderDescriptor(descriptor); + if (!id_or_err) { + result.SetError(id_or_err.takeError()); + return; + } + + result.AppendMessageWithFormat( + "successfully registered scripted frame provider '%s' for target\n", + m_class_options.GetName().c_str()); + } + + OptionGroupPythonClassWithDict m_class_options; + OptionGroupOptions m_all_options; +}; + +class CommandObjectTargetFrameProviderClear : public CommandObjectParsed { +public: + CommandObjectTargetFrameProviderClear(CommandInterpreter &interpreter) + : CommandObjectParsed( + interpreter, "target frame-provider clear", + "Clear all registered frame providers from this target.", nullptr, + eCommandRequiresTarget) {} + + ~CommandObjectTargetFrameProviderClear() override = default; + +protected: + void DoExecute(Args &command, CommandReturnObject &result) override { + Target *target = m_exe_ctx.GetTargetPtr(); + if (!target) { + result.AppendError("invalid target"); + return; + } + + target->ClearScriptedFrameProviderDescriptors(); + + result.SetStatus(eReturnStatusSuccessFinishResult); + } +}; + +class CommandObjectTargetFrameProviderList : public CommandObjectParsed { +public: + CommandObjectTargetFrameProviderList(CommandInterpreter &interpreter) + : CommandObjectParsed( + interpreter, "target frame-provider list", + "List all registered frame providers for the target.", nullptr, + eCommandRequiresTarget) {} + + ~CommandObjectTargetFrameProviderList() override = default; + +protected: + void DoExecute(Args &command, CommandReturnObject &result) override { + Target *target = m_exe_ctx.GetTargetPtr(); + if (!target) + target = &GetDebugger().GetDummyTarget(); + + const auto &descriptors = target->GetScriptedFrameProviderDescriptors(); + if (descriptors.empty()) { + result.AppendMessage("no frame providers registered for this target."); + result.SetStatus(eReturnStatusSuccessFinishResult); + return; + } + + result.AppendMessageWithFormat("%u frame provider(s) registered:\n\n", + descriptors.size()); + + for (const auto &entry : descriptors) { + const ScriptedFrameProviderDescriptor &descriptor = entry.second; + descriptor.Dump(&result.GetOutputStream()); + result.GetOutputStream().PutChar('\n'); + } + + result.SetStatus(eReturnStatusSuccessFinishResult); + } +}; + +class CommandObjectTargetFrameProviderRemove : public CommandObjectParsed { +public: + CommandObjectTargetFrameProviderRemove(CommandInterpreter &interpreter) + : CommandObjectParsed( + interpreter, "target frame-provider remove", + "Remove a registered frame provider from the target by id.", + "target frame-provider remove ", + eCommandRequiresTarget) { + AddSimpleArgumentList(eArgTypeUnsignedInteger, eArgRepeatPlus); + } + + ~CommandObjectTargetFrameProviderRemove() override = default; + +protected: + void DoExecute(Args &command, CommandReturnObject &result) override { + Target *target = m_exe_ctx.GetTargetPtr(); + if (!target) + target = &GetDebugger().GetDummyTarget(); + + std::vector removed_provider_ids; + for (size_t i = 0; i < command.GetArgumentCount(); i++) { + uint32_t provider_id = 0; + if (!llvm::to_integer(command[i].ref(), provider_id)) { + result.AppendError("target frame-provider remove requires integer " + "provider id argument"); + return; + } + + if (!target->RemoveScriptedFrameProviderDescriptor(provider_id)) { + result.AppendErrorWithFormat( + "no frame provider named '%u' found in target\n", provider_id); + 
return; + } + removed_provider_ids.push_back(provider_id); + } + + if (size_t num_removed_providers = removed_provider_ids.size()) { + result.AppendMessageWithFormat( + "Successfully removed %zu frame-providers.\n", num_removed_providers); + result.SetStatus(eReturnStatusSuccessFinishNoResult); + } else { + result.AppendError("0 frame providers removed.\n"); + } + } +}; + +class CommandObjectTargetFrameProvider : public CommandObjectMultiword { +public: + CommandObjectTargetFrameProvider(CommandInterpreter &interpreter) + : CommandObjectMultiword( + interpreter, "target frame-provider", + "Commands for registering and viewing frame providers for the " + "target.", + "target frame-provider [] ") { + LoadSubCommand("register", + CommandObjectSP(new CommandObjectTargetFrameProviderRegister( + interpreter))); + LoadSubCommand("clear", + CommandObjectSP( + new CommandObjectTargetFrameProviderClear(interpreter))); + LoadSubCommand( + "list", + CommandObjectSP(new CommandObjectTargetFrameProviderList(interpreter))); + LoadSubCommand( + "remove", CommandObjectSP( + new CommandObjectTargetFrameProviderRemove(interpreter))); + } + + ~CommandObjectTargetFrameProvider() override = default; +}; + #pragma mark CommandObjectMultiwordTarget // CommandObjectMultiwordTarget @@ -5416,6 +5613,9 @@ CommandObjectMultiwordTarget::CommandObjectMultiwordTarget( CommandObjectSP(new CommandObjectTargetDelete(interpreter))); LoadSubCommand("dump", CommandObjectSP(new CommandObjectTargetDump(interpreter))); + LoadSubCommand( + "frame-provider", + CommandObjectSP(new CommandObjectTargetFrameProvider(interpreter))); LoadSubCommand("list", CommandObjectSP(new CommandObjectTargetList(interpreter))); LoadSubCommand("select", diff --git a/lldb/source/Interpreter/ScriptInterpreter.cpp b/lldb/source/Interpreter/ScriptInterpreter.cpp index 211868b51facb..69d8607a873f3 100644 --- a/lldb/source/Interpreter/ScriptInterpreter.cpp +++ b/lldb/source/Interpreter/ScriptInterpreter.cpp @@ -106,6 +106,13 @@ ScriptInterpreter::GetStatusFromSBError(const lldb::SBError &error) const { return Status(); } +lldb::ThreadSP ScriptInterpreter::GetOpaqueTypeFromSBThread( + const lldb::SBThread &thread) const { + if (thread.m_opaque_sp) + return thread.m_opaque_sp->GetThreadSP(); + return nullptr; +} + lldb::StackFrameSP ScriptInterpreter::GetOpaqueTypeFromSBFrame(const lldb::SBFrame &frame) const { if (frame.m_opaque_sp) diff --git a/lldb/source/Plugins/CMakeLists.txt b/lldb/source/Plugins/CMakeLists.txt index 08f444e7b15e8..b6878b21ff71a 100644 --- a/lldb/source/Plugins/CMakeLists.txt +++ b/lldb/source/Plugins/CMakeLists.txt @@ -22,6 +22,7 @@ add_subdirectory(SymbolFile) add_subdirectory(SystemRuntime) add_subdirectory(SymbolLocator) add_subdirectory(SymbolVendor) +add_subdirectory(SyntheticFrameProvider) add_subdirectory(Trace) add_subdirectory(TraceExporter) add_subdirectory(TypeSystem) diff --git a/lldb/source/Plugins/Process/scripted/ScriptedFrame.cpp b/lldb/source/Plugins/Process/scripted/ScriptedFrame.cpp index 6519df9185df0..53d0c22e62ad7 100644 --- a/lldb/source/Plugins/Process/scripted/ScriptedFrame.cpp +++ b/lldb/source/Plugins/Process/scripted/ScriptedFrame.cpp @@ -7,8 +7,22 @@ //===----------------------------------------------------------------------===// #include "ScriptedFrame.h" - +#include "Plugins/Process/Utility/RegisterContextMemory.h" + +#include "lldb/Core/Address.h" +#include "lldb/Core/Debugger.h" +#include "lldb/Interpreter/Interfaces/ScriptedFrameInterface.h" +#include 
"lldb/Interpreter/Interfaces/ScriptedThreadInterface.h" +#include "lldb/Interpreter/ScriptInterpreter.h" +#include "lldb/Symbol/SymbolContext.h" +#include "lldb/Target/ExecutionContext.h" +#include "lldb/Target/Process.h" +#include "lldb/Target/RegisterContext.h" +#include "lldb/Target/Thread.h" #include "lldb/Utility/DataBufferHeap.h" +#include "lldb/Utility/LLDBLog.h" +#include "lldb/Utility/Log.h" +#include "lldb/Utility/StructuredData.h" using namespace lldb; using namespace lldb_private; @@ -19,30 +33,44 @@ void ScriptedFrame::CheckInterpreterAndScriptObject() const { } llvm::Expected> -ScriptedFrame::Create(ScriptedThread &thread, +ScriptedFrame::Create(ThreadSP thread_sp, + ScriptedThreadInterfaceSP scripted_thread_interface_sp, StructuredData::DictionarySP args_sp, StructuredData::Generic *script_object) { - if (!thread.IsValid()) - return llvm::createStringError("Invalid scripted thread."); + if (!thread_sp || !thread_sp->IsValid()) + return llvm::createStringError("invalid thread"); + + ProcessSP process_sp = thread_sp->GetProcess(); + if (!process_sp || !process_sp->IsValid()) + return llvm::createStringError("invalid process"); - thread.CheckInterpreterAndScriptObject(); + ScriptInterpreter *script_interp = + process_sp->GetTarget().GetDebugger().GetScriptInterpreter(); + if (!script_interp) + return llvm::createStringError("no script interpreter"); - auto scripted_frame_interface = - thread.GetInterface()->CreateScriptedFrameInterface(); + auto scripted_frame_interface = script_interp->CreateScriptedFrameInterface(); if (!scripted_frame_interface) return llvm::createStringError("failed to create scripted frame interface"); llvm::StringRef frame_class_name; if (!script_object) { - std::optional class_name = - thread.GetInterface()->GetScriptedFramePluginName(); - if (!class_name || class_name->empty()) + // If no script object is provided and we have a scripted thread interface, + // try to get the frame class name from it. 
+ if (scripted_thread_interface_sp) { + std::optional class_name = + scripted_thread_interface_sp->GetScriptedFramePluginName(); + if (!class_name || class_name->empty()) + return llvm::createStringError( + "failed to get scripted frame class name"); + frame_class_name = *class_name; + } else { return llvm::createStringError( - "failed to get scripted thread class name"); - frame_class_name = *class_name; + "no script object provided and no scripted thread interface"); + } } - ExecutionContext exe_ctx(thread); + ExecutionContext exe_ctx(thread_sp); auto obj_or_err = scripted_frame_interface->CreatePluginObject( frame_class_name, exe_ctx, args_sp, script_object); @@ -62,7 +90,7 @@ ScriptedFrame::Create(ScriptedThread &thread, SymbolContext sc; Address symbol_addr; if (pc != LLDB_INVALID_ADDRESS) { - symbol_addr.SetLoadAddress(pc, &thread.GetProcess()->GetTarget()); + symbol_addr.SetLoadAddress(pc, &process_sp->GetTarget()); symbol_addr.CalculateSymbolContext(&sc); } @@ -77,11 +105,11 @@ ScriptedFrame::Create(ScriptedThread &thread, if (!reg_info) return llvm::createStringError( - "failed to get scripted thread registers info"); + "failed to get scripted frame registers info"); std::shared_ptr register_info_sp = - DynamicRegisterInfo::Create( - *reg_info, thread.GetProcess()->GetTarget().GetArchitecture()); + DynamicRegisterInfo::Create(*reg_info, + process_sp->GetTarget().GetArchitecture()); lldb::RegisterContextSP reg_ctx_sp; @@ -96,32 +124,35 @@ ScriptedFrame::Create(ScriptedThread &thread, std::shared_ptr reg_ctx_memory = std::make_shared( - thread, frame_id, *register_info_sp, LLDB_INVALID_ADDRESS); + *thread_sp, frame_id, *register_info_sp, LLDB_INVALID_ADDRESS); if (!reg_ctx_memory) - return llvm::createStringError("failed to create a register context."); + return llvm::createStringError("failed to create a register context"); reg_ctx_memory->SetAllRegisterData(data_sp); reg_ctx_sp = reg_ctx_memory; } return std::make_shared( - thread, scripted_frame_interface, frame_id, pc, sc, reg_ctx_sp, + thread_sp, scripted_frame_interface, frame_id, pc, sc, reg_ctx_sp, register_info_sp, owned_script_object_sp); } -ScriptedFrame::ScriptedFrame(ScriptedThread &thread, +ScriptedFrame::ScriptedFrame(ThreadSP thread_sp, ScriptedFrameInterfaceSP interface_sp, lldb::user_id_t id, lldb::addr_t pc, SymbolContext &sym_ctx, lldb::RegisterContextSP reg_ctx_sp, std::shared_ptr reg_info_sp, StructuredData::GenericSP script_object_sp) - : StackFrame(thread.shared_from_this(), /*frame_idx=*/id, + : StackFrame(thread_sp, /*frame_idx=*/id, /*concrete_frame_idx=*/id, /*reg_context_sp=*/reg_ctx_sp, /*cfa=*/0, /*pc=*/pc, /*behaves_like_zeroth_frame=*/!id, /*symbol_ctx=*/&sym_ctx), m_scripted_frame_interface_sp(interface_sp), - m_script_object_sp(script_object_sp), m_register_info_sp(reg_info_sp) {} + m_script_object_sp(script_object_sp), m_register_info_sp(reg_info_sp) { + // FIXME: This should be part of the base class constructor. 
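+  // Marking the frame Synthetic lets consumers tell provider-generated
+  // frames apart from unwinder-produced ones; it is the same kind the
+  // provider's dictionary-built frames use.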
+ m_stack_frame_kind = StackFrame::Kind::Synthetic; +} ScriptedFrame::~ScriptedFrame() {} @@ -164,7 +195,7 @@ std::shared_ptr ScriptedFrame::GetDynamicRegisterInfo() { if (!reg_info) return ScriptedInterface::ErrorWithMessage< std::shared_ptr>( - LLVM_PRETTY_FUNCTION, "Failed to get scripted frame registers info.", + LLVM_PRETTY_FUNCTION, "failed to get scripted frame registers info", error, LLDBLog::Thread); ThreadSP thread_sp = m_thread_wp.lock(); @@ -172,7 +203,7 @@ std::shared_ptr ScriptedFrame::GetDynamicRegisterInfo() { return ScriptedInterface::ErrorWithMessage< std::shared_ptr>( LLVM_PRETTY_FUNCTION, - "Failed to get scripted frame registers info: invalid thread.", error, + "failed to get scripted frame registers info: invalid thread", error, LLDBLog::Thread); ProcessSP process_sp = thread_sp->GetProcess(); @@ -180,8 +211,8 @@ std::shared_ptr ScriptedFrame::GetDynamicRegisterInfo() { return ScriptedInterface::ErrorWithMessage< std::shared_ptr>( LLVM_PRETTY_FUNCTION, - "Failed to get scripted frame registers info: invalid process.", - error, LLDBLog::Thread); + "failed to get scripted frame registers info: invalid process", error, + LLDBLog::Thread); m_register_info_sp = DynamicRegisterInfo::Create( *reg_info, process_sp->GetTarget().GetArchitecture()); diff --git a/lldb/source/Plugins/Process/scripted/ScriptedFrame.h b/lldb/source/Plugins/Process/scripted/ScriptedFrame.h index b6b77c4a7d160..e91e6160bac2f 100644 --- a/lldb/source/Plugins/Process/scripted/ScriptedFrame.h +++ b/lldb/source/Plugins/Process/scripted/ScriptedFrame.h @@ -10,21 +10,19 @@ #define LLDB_SOURCE_PLUGINS_SCRIPTED_FRAME_H #include "ScriptedThread.h" -#include "lldb/Interpreter/ScriptInterpreter.h" #include "lldb/Target/DynamicRegisterInfo.h" #include "lldb/Target/StackFrame.h" +#include "lldb/lldb-forward.h" +#include "llvm/Support/Error.h" +#include #include -namespace lldb_private { -class ScriptedThread; -} - namespace lldb_private { class ScriptedFrame : public lldb_private::StackFrame { public: - ScriptedFrame(ScriptedThread &thread, + ScriptedFrame(lldb::ThreadSP thread_sp, lldb::ScriptedFrameInterfaceSP interface_sp, lldb::user_id_t frame_idx, lldb::addr_t pc, SymbolContext &sym_ctx, lldb::RegisterContextSP reg_ctx_sp, @@ -33,8 +31,29 @@ class ScriptedFrame : public lldb_private::StackFrame { ~ScriptedFrame() override; + /// Create a ScriptedFrame from a object instanciated in the script + /// interpreter. + /// + /// \param[in] thread_sp + /// The thread this frame belongs to. + /// + /// \param[in] scripted_thread_interface_sp + /// The scripted thread interface (needed for ScriptedThread + /// compatibility). Can be nullptr for frames on real threads. + /// + /// \param[in] args_sp + /// Arguments to pass to the frame creation. + /// + /// \param[in] script_object + /// The optional script object representing this frame. + /// + /// \return + /// An Expected containing the ScriptedFrame shared pointer if successful, + /// otherwise an error. 
static llvm::Expected> - Create(ScriptedThread &thread, StructuredData::DictionarySP args_sp, + Create(lldb::ThreadSP thread_sp, + lldb::ScriptedThreadInterfaceSP scripted_thread_interface_sp, + StructuredData::DictionarySP args_sp, StructuredData::Generic *script_object = nullptr); bool IsInlined() override; diff --git a/lldb/source/Plugins/Process/scripted/ScriptedThread.cpp b/lldb/source/Plugins/Process/scripted/ScriptedThread.cpp index 491efac5aadef..1dd9c48f56a59 100644 --- a/lldb/source/Plugins/Process/scripted/ScriptedThread.cpp +++ b/lldb/source/Plugins/Process/scripted/ScriptedThread.cpp @@ -210,7 +210,7 @@ bool ScriptedThread::LoadArtificialStackFrames() { SymbolContext sc; symbol_addr.CalculateSymbolContext(&sc); - return std::make_shared(this->shared_from_this(), idx, idx, cfa, + return std::make_shared(shared_from_this(), idx, idx, cfa, cfa_is_valid, pc, StackFrame::Kind::Synthetic, artificial, behaves_like_zeroth_frame, &sc); @@ -231,8 +231,8 @@ bool ScriptedThread::LoadArtificialStackFrames() { return error.ToError(); } - auto frame_or_error = - ScriptedFrame::Create(*this, nullptr, object_sp->GetAsGeneric()); + auto frame_or_error = ScriptedFrame::Create( + shared_from_this(), GetInterface(), nullptr, object_sp->GetAsGeneric()); if (!frame_or_error) { ScriptedInterface::ErrorWithMessage( diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptInterpreterPythonInterfaces.cpp b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptInterpreterPythonInterfaces.cpp index d43036d6fe544..f6c707b2bd168 100644 --- a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptInterpreterPythonInterfaces.cpp +++ b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptInterpreterPythonInterfaces.cpp @@ -31,6 +31,7 @@ void ScriptInterpreterPythonInterfaces::Initialize() { ScriptedStopHookPythonInterface::Initialize(); ScriptedBreakpointPythonInterface::Initialize(); ScriptedThreadPlanPythonInterface::Initialize(); + ScriptedFrameProviderPythonInterface::Initialize(); } void ScriptInterpreterPythonInterfaces::Terminate() { @@ -40,6 +41,7 @@ void ScriptInterpreterPythonInterfaces::Terminate() { ScriptedStopHookPythonInterface::Terminate(); ScriptedBreakpointPythonInterface::Terminate(); ScriptedThreadPlanPythonInterface::Terminate(); + ScriptedFrameProviderPythonInterface::Terminate(); } #endif diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedFrameProviderPythonInterface.cpp b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedFrameProviderPythonInterface.cpp index b866bf332b7b6..3dde5036453f4 100644 --- a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedFrameProviderPythonInterface.cpp +++ b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedFrameProviderPythonInterface.cpp @@ -6,6 +6,7 @@ // //===----------------------------------------------------------------------===// +#include "lldb/Core/PluginManager.h" #include "lldb/Host/Config.h" #include "lldb/Target/Thread.h" #include "lldb/Utility/Log.h" @@ -30,18 +31,45 @@ ScriptedFrameProviderPythonInterface::ScriptedFrameProviderPythonInterface( ScriptInterpreterPythonImpl &interpreter) : ScriptedFrameProviderInterface(), ScriptedPythonInterface(interpreter) {} +bool ScriptedFrameProviderPythonInterface::AppliesToThread( + llvm::StringRef class_name, lldb::ThreadSP thread_sp) { + // If there is any issue with this method, we will just assume it also applies + // to this thread which is the default behavior. 
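+  // Expected Python hook, inferred from the CallStaticMethod call below
+  // (sketch; the body shown is illustrative):
+  //   @staticmethod
+  //   def applies_to_thread(thread):  # thread: lldb.SBThread
+  //       return thread.GetIndexID() == 1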
+ constexpr bool fail_value = true; + Status error; + StructuredData::ObjectSP obj = + CallStaticMethod(class_name, "applies_to_thread", error, thread_sp); + if (!ScriptedInterface::CheckStructuredDataObject(LLVM_PRETTY_FUNCTION, obj, + error)) + return fail_value; + + return obj->GetBooleanValue(fail_value); +} + llvm::Expected ScriptedFrameProviderPythonInterface::CreatePluginObject( const llvm::StringRef class_name, lldb::StackFrameListSP input_frames, StructuredData::DictionarySP args_sp) { if (!input_frames) - return llvm::createStringError("Invalid frame list"); + return llvm::createStringError("invalid frame list"); StructuredDataImpl sd_impl(args_sp); return ScriptedPythonInterface::CreatePluginObject(class_name, nullptr, input_frames, sd_impl); } +std::string ScriptedFrameProviderPythonInterface::GetDescription( + llvm::StringRef class_name) { + Status error; + StructuredData::ObjectSP obj = + CallStaticMethod(class_name, "get_description", error); + if (!ScriptedInterface::CheckStructuredDataObject(LLVM_PRETTY_FUNCTION, obj, + error)) + return {}; + + return obj->GetStringValue().str(); +} + StructuredData::ObjectSP ScriptedFrameProviderPythonInterface::GetFrameAtIndex(uint32_t index) { Status error; @@ -54,4 +82,32 @@ ScriptedFrameProviderPythonInterface::GetFrameAtIndex(uint32_t index) { return obj; } +bool ScriptedFrameProviderPythonInterface::CreateInstance( + lldb::ScriptLanguage language, ScriptedInterfaceUsages usages) { + if (language != eScriptLanguagePython) + return false; + + return true; +} + +void ScriptedFrameProviderPythonInterface::Initialize() { + const std::vector ci_usages = { + "target frame-provider register -C [-k key -v value ...]", + "target frame-provider list", + "target frame-provider remove ", + "target frame-provider clear"}; + const std::vector api_usages = { + "SBTarget.RegisterScriptedFrameProvider", + "SBTarget.RemoveScriptedFrameProvider", + "SBTarget.ClearScriptedFrameProvider"}; + PluginManager::RegisterPlugin( + GetPluginNameStatic(), + llvm::StringRef("Provide scripted stack frames for threads"), + CreateInstance, eScriptLanguagePython, {ci_usages, api_usages}); +} + +void ScriptedFrameProviderPythonInterface::Terminate() { + PluginManager::UnregisterPlugin(CreateInstance); +} + #endif diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedFrameProviderPythonInterface.h b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedFrameProviderPythonInterface.h index fd163984028d3..97a5cc7c669ea 100644 --- a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedFrameProviderPythonInterface.h +++ b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedFrameProviderPythonInterface.h @@ -14,17 +14,22 @@ #if LLDB_ENABLE_PYTHON #include "ScriptedPythonInterface.h" +#include "lldb/Core/PluginInterface.h" #include "lldb/Interpreter/Interfaces/ScriptedFrameProviderInterface.h" #include namespace lldb_private { class ScriptedFrameProviderPythonInterface : public ScriptedFrameProviderInterface, - public ScriptedPythonInterface { + public ScriptedPythonInterface, + public PluginInterface { public: ScriptedFrameProviderPythonInterface( ScriptInterpreterPythonImpl &interpreter); + bool AppliesToThread(llvm::StringRef class_name, + lldb::ThreadSP thread_sp) override; + llvm::Expected CreatePluginObject(llvm::StringRef class_name, lldb::StackFrameListSP input_frames, @@ -33,10 +38,24 @@ class ScriptedFrameProviderPythonInterface llvm::SmallVector GetAbstractMethodRequirements() const override { return 
llvm::SmallVector( - {{"get_frame_at_index"}}); + {{"get_description"}, {"get_frame_at_index"}}); } + std::string GetDescription(llvm::StringRef class_name) override; + StructuredData::ObjectSP GetFrameAtIndex(uint32_t index) override; + + static void Initialize(); + static void Terminate(); + + static bool CreateInstance(lldb::ScriptLanguage language, + ScriptedInterfaceUsages usages); + + static llvm::StringRef GetPluginNameStatic() { + return "ScriptedFrameProviderPythonInterface"; + } + + llvm::StringRef GetPluginName() override { return GetPluginNameStatic(); } }; } // namespace lldb_private diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPythonInterface.cpp b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPythonInterface.cpp index af2e0b5df4d22..ba4473cf9ec4d 100644 --- a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPythonInterface.cpp +++ b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPythonInterface.cpp @@ -93,6 +93,19 @@ ScriptedPythonInterface::ExtractValueFromPythonObject( return nullptr; } +template <> +lldb::ThreadSP +ScriptedPythonInterface::ExtractValueFromPythonObject( + python::PythonObject &p, Status &error) { + if (lldb::SBThread *sb_thread = reinterpret_cast( + python::LLDBSWIGPython_CastPyObjectToSBThread(p.get()))) + return m_interpreter.GetOpaqueTypeFromSBThread(*sb_thread); + error = Status::FromErrorString( + "Couldn't cast lldb::SBThread to lldb_private::Thread."); + + return nullptr; +} + template <> SymbolContext ScriptedPythonInterface::ExtractValueFromPythonObject( diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPythonInterface.h b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPythonInterface.h index af88a69e34a13..c460f58b4e721 100644 --- a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPythonInterface.h +++ b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPythonInterface.h @@ -330,6 +330,112 @@ class ScriptedPythonInterface : virtual public ScriptedInterface { return m_object_instance_sp; } + /// Call a static method on a Python class without creating an instance. + /// + /// This method resolves a Python class by name and calls a static method + /// on it, returning the result. This is useful for calling class-level + /// methods that don't require an instance. + /// + /// \param class_name The fully-qualified name of the Python class. + /// \param method_name The name of the static method to call. + /// \param error Output parameter to receive error information if the call + /// fails. + /// \param args Arguments to pass to the static method. + /// + /// \return The return value of the static method call, or an error value. + template + T CallStaticMethod(llvm::StringRef class_name, llvm::StringRef method_name, + Status &error, Args &&...args) { + using namespace python; + using Locker = ScriptInterpreterPythonImpl::Locker; + + std::string caller_signature = + llvm::Twine(LLVM_PRETTY_FUNCTION + llvm::Twine(" (") + + llvm::Twine(class_name) + llvm::Twine(".") + + llvm::Twine(method_name) + llvm::Twine(")")) + .str(); + + if (class_name.empty()) + return ErrorWithMessage(caller_signature, "missing script class name", + error); + + Locker py_lock(&m_interpreter, Locker::AcquireLock | Locker::NoSTDIN, + Locker::FreeLock); + + // Get the interpreter dictionary. 
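+    // (This is the session dictionary that `command script import`
+    //  populates, so dotted names like "module.Class" resolve against
+    //  user-loaded scripts.)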
+ auto dict = + PythonModule::MainModule().ResolveName( + m_interpreter.GetDictionaryName()); + if (!dict.IsAllocated()) + return ErrorWithMessage( + caller_signature, + llvm::formatv("could not find interpreter dictionary: {0}", + m_interpreter.GetDictionaryName()) + .str(), + error); + + // Resolve the class. + auto class_obj = + PythonObject::ResolveNameWithDictionary( + class_name, dict); + if (!class_obj.IsAllocated()) + return ErrorWithMessage( + caller_signature, + llvm::formatv("could not find script class: {0}", class_name).str(), + error); + + // Get the static method from the class. + if (!class_obj.HasAttribute(method_name)) + return ErrorWithMessage( + caller_signature, + llvm::formatv("class {0} does not have method {1}", class_name, + method_name) + .str(), + error); + + PythonCallable method = + class_obj.GetAttributeValue(method_name).AsType(); + if (!method.IsAllocated()) + return ErrorWithMessage(caller_signature, + llvm::formatv("method {0}.{1} is not callable", + class_name, method_name) + .str(), + error); + + // Transform the arguments. + std::tuple original_args = std::forward_as_tuple(args...); + auto transformed_args = TransformArgs(original_args); + + // Call the static method. + llvm::Expected expected_return_object = + llvm::make_error("Not initialized.", + llvm::inconvertibleErrorCode()); + std::apply( + [&method, &expected_return_object](auto &&...args) { + llvm::consumeError(expected_return_object.takeError()); + expected_return_object = method(args...); + }, + transformed_args); + + if (llvm::Error e = expected_return_object.takeError()) { + error = Status::FromError(std::move(e)); + return ErrorWithMessage( + caller_signature, "python static method could not be called", error); + } + + PythonObject py_return = std::move(expected_return_object.get()); + + // Re-assign reference and pointer arguments if needed. + if (sizeof...(Args) > 0) + if (!ReassignPtrsOrRefsArgs(original_args, transformed_args)) + return ErrorWithMessage( + caller_signature, + "couldn't re-assign reference and pointer arguments", error); + + // Extract value from Python object (handles unallocated case). + return ExtractValueFromPythonObject(py_return, error); + } + protected: template T ExtractValueFromPythonObject(python::PythonObject &p, Status &error) { @@ -346,7 +452,7 @@ class ScriptedPythonInterface : virtual public ScriptedInterface { llvm::Twine(method_name) + llvm::Twine(")")) .str(); if (!m_object_instance_sp) - return ErrorWithMessage(caller_signature, "Python object ill-formed", + return ErrorWithMessage(caller_signature, "python object ill-formed", error); Locker py_lock(&m_interpreter, Locker::AcquireLock | Locker::NoSTDIN, @@ -358,7 +464,7 @@ class ScriptedPythonInterface : virtual public ScriptedInterface { if (!implementor.IsAllocated()) return llvm::is_contained(GetAbstractMethods(), method_name) ? 
ErrorWithMessage(caller_signature, - "Python implementor not allocated.", + "python implementor not allocated", error) : T{}; @@ -379,20 +485,20 @@ class ScriptedPythonInterface : virtual public ScriptedInterface { if (llvm::Error e = expected_return_object.takeError()) { error = Status::FromError(std::move(e)); return ErrorWithMessage(caller_signature, - "Python method could not be called.", error); + "python method could not be called", error); } PythonObject py_return = std::move(expected_return_object.get()); // Now that we called the python method with the transformed arguments, - // we need to interate again over both the original and transformed + // we need to iterate again over both the original and transformed // parameter pack, and transform back the parameter that were passed in // the original parameter pack as references or pointers. if (sizeof...(Args) > 0) if (!ReassignPtrsOrRefsArgs(original_args, transformed_args)) return ErrorWithMessage( caller_signature, - "Couldn't re-assign reference and pointer arguments.", error); + "couldn't re-assign reference and pointer arguments", error); if (!py_return.IsAllocated()) return {}; @@ -598,6 +704,11 @@ lldb::StreamSP ScriptedPythonInterface::ExtractValueFromPythonObject( python::PythonObject &p, Status &error); +template <> +lldb::ThreadSP +ScriptedPythonInterface::ExtractValueFromPythonObject( + python::PythonObject &p, Status &error); + template <> lldb::StackFrameSP ScriptedPythonInterface::ExtractValueFromPythonObject( diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/SWIGPythonBridge.h b/lldb/source/Plugins/ScriptInterpreter/Python/SWIGPythonBridge.h index 2c971262fc34e..32948ffd30023 100644 --- a/lldb/source/Plugins/ScriptInterpreter/Python/SWIGPythonBridge.h +++ b/lldb/source/Plugins/ScriptInterpreter/Python/SWIGPythonBridge.h @@ -265,6 +265,7 @@ void *LLDBSWIGPython_CastPyObjectToSBLaunchInfo(PyObject *data); void *LLDBSWIGPython_CastPyObjectToSBError(PyObject *data); void *LLDBSWIGPython_CastPyObjectToSBEvent(PyObject *data); void *LLDBSWIGPython_CastPyObjectToSBStream(PyObject *data); +void *LLDBSWIGPython_CastPyObjectToSBThread(PyObject *data); void *LLDBSWIGPython_CastPyObjectToSBFrame(PyObject *data); void *LLDBSWIGPython_CastPyObjectToSBSymbolContext(PyObject *data); void *LLDBSWIGPython_CastPyObjectToSBValue(PyObject *data); diff --git a/lldb/source/Plugins/SyntheticFrameProvider/CMakeLists.txt b/lldb/source/Plugins/SyntheticFrameProvider/CMakeLists.txt new file mode 100644 index 0000000000000..85b405e648c1f --- /dev/null +++ b/lldb/source/Plugins/SyntheticFrameProvider/CMakeLists.txt @@ -0,0 +1 @@ +add_subdirectory(ScriptedFrameProvider) diff --git a/lldb/source/Plugins/SyntheticFrameProvider/ScriptedFrameProvider/CMakeLists.txt b/lldb/source/Plugins/SyntheticFrameProvider/ScriptedFrameProvider/CMakeLists.txt new file mode 100644 index 0000000000000..fe67d39efdf11 --- /dev/null +++ b/lldb/source/Plugins/SyntheticFrameProvider/ScriptedFrameProvider/CMakeLists.txt @@ -0,0 +1,12 @@ +add_lldb_library(lldbPluginScriptedFrameProvider PLUGIN + ScriptedFrameProvider.cpp + + LINK_COMPONENTS + Support + + LINK_LIBS + lldbCore + lldbInterpreter + lldbTarget + lldbUtility + ) diff --git a/lldb/source/Plugins/SyntheticFrameProvider/ScriptedFrameProvider/ScriptedFrameProvider.cpp b/lldb/source/Plugins/SyntheticFrameProvider/ScriptedFrameProvider/ScriptedFrameProvider.cpp new file mode 100644 index 0000000000000..17d0e925fadc6 --- /dev/null +++ 
b/lldb/source/Plugins/SyntheticFrameProvider/ScriptedFrameProvider/ScriptedFrameProvider.cpp @@ -0,0 +1,215 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "ScriptedFrameProvider.h" +#include "Plugins/Process/scripted/ScriptedFrame.h" +#include "lldb/Core/Debugger.h" +#include "lldb/Core/PluginManager.h" +#include "lldb/Interpreter/Interfaces/ScriptedFrameProviderInterface.h" +#include "lldb/Interpreter/ScriptInterpreter.h" +#include "lldb/Target/Process.h" +#include "lldb/Target/StackFrame.h" +#include "lldb/Target/Thread.h" +#include "lldb/Utility/ScriptedMetadata.h" +#include "lldb/Utility/Status.h" +#include "llvm/Support/Error.h" +#include + +using namespace lldb; +using namespace lldb_private; + +void ScriptedFrameProvider::Initialize() { + PluginManager::RegisterPlugin(GetPluginNameStatic(), + "Provides synthetic frames via scripting", + nullptr, ScriptedFrameProvider::CreateInstance); +} + +void ScriptedFrameProvider::Terminate() { + PluginManager::UnregisterPlugin(ScriptedFrameProvider::CreateInstance); +} + +llvm::Expected +ScriptedFrameProvider::CreateInstance( + lldb::StackFrameListSP input_frames, + const ScriptedFrameProviderDescriptor &descriptor) { + if (!input_frames) + return llvm::createStringError( + "failed to create scripted frame provider: invalid input frames"); + + Thread &thread = input_frames->GetThread(); + ProcessSP process_sp = thread.GetProcess(); + if (!process_sp) + return nullptr; + + if (!descriptor.IsValid()) + return llvm::createStringError( + "failed to create scripted frame provider: invalid scripted metadata"); + + if (!descriptor.AppliesToThread(thread)) + return nullptr; + + ScriptInterpreter *script_interp = + process_sp->GetTarget().GetDebugger().GetScriptInterpreter(); + if (!script_interp) + return llvm::createStringError("cannot create scripted frame provider: No " + "script interpreter installed"); + + ScriptedFrameProviderInterfaceSP interface_sp = + script_interp->CreateScriptedFrameProviderInterface(); + if (!interface_sp) + return llvm::createStringError( + "cannot create scripted frame provider: script interpreter couldn't " + "create Scripted Frame Provider Interface"); + + const ScriptedMetadataSP scripted_metadata = descriptor.scripted_metadata_sp; + + // If we shouldn't attach a frame provider to this thread, just exit early. 
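+  // (Second of two filters: descriptor.AppliesToThread() above checked the
+  //  C++ ThreadSpec list; this call lets the Python class itself opt out of
+  //  individual threads.)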
+ if (!interface_sp->AppliesToThread(scripted_metadata->GetClassName(), + thread.shared_from_this())) + return nullptr; + + auto obj_or_err = interface_sp->CreatePluginObject( + scripted_metadata->GetClassName(), input_frames, + scripted_metadata->GetArgsSP()); + if (!obj_or_err) + return obj_or_err.takeError(); + + StructuredData::ObjectSP object_sp = *obj_or_err; + if (!object_sp || !object_sp->IsValid()) + return llvm::createStringError( + "cannot create scripted frame provider: failed to create valid scripted" + "frame provider object"); + + return std::make_shared(input_frames, interface_sp, + descriptor); +} + +ScriptedFrameProvider::ScriptedFrameProvider( + StackFrameListSP input_frames, + lldb::ScriptedFrameProviderInterfaceSP interface_sp, + const ScriptedFrameProviderDescriptor &descriptor) + : SyntheticFrameProvider(input_frames), m_interface_sp(interface_sp), + m_descriptor(descriptor) {} + +ScriptedFrameProvider::~ScriptedFrameProvider() = default; + +std::string ScriptedFrameProvider::GetDescription() const { + if (!m_interface_sp) + return {}; + + return m_interface_sp->GetDescription(m_descriptor.GetName()); +} + +llvm::Expected +ScriptedFrameProvider::GetFrameAtIndex(uint32_t idx) { + if (!m_interface_sp) + return llvm::createStringError( + "cannot get stack frame: scripted frame provider not initialized"); + + auto create_frame_from_dict = + [this](StructuredData::Dictionary *dict, + uint32_t index) -> llvm::Expected { + lldb::addr_t pc; + if (!dict->GetValueForKeyAsInteger("pc", pc)) + return llvm::createStringError( + "missing 'pc' key from scripted frame dictionary"); + + Address symbol_addr; + symbol_addr.SetLoadAddress(pc, &GetThread().GetProcess()->GetTarget()); + + const lldb::addr_t cfa = LLDB_INVALID_ADDRESS; + const bool cfa_is_valid = false; + const bool artificial = false; + const bool behaves_like_zeroth_frame = false; + SymbolContext sc; + symbol_addr.CalculateSymbolContext(&sc); + + ThreadSP thread_sp = GetThread().shared_from_this(); + return std::make_shared(thread_sp, index, index, cfa, + cfa_is_valid, pc, + StackFrame::Kind::Synthetic, artificial, + behaves_like_zeroth_frame, &sc); + }; + + auto create_frame_from_script_object = + [this]( + StructuredData::ObjectSP object_sp) -> llvm::Expected { + Status error; + if (!object_sp || !object_sp->GetAsGeneric()) + return llvm::createStringError("invalid script object"); + + ThreadSP thread_sp = GetThread().shared_from_this(); + auto frame_or_error = ScriptedFrame::Create(thread_sp, nullptr, nullptr, + object_sp->GetAsGeneric()); + + if (!frame_or_error) { + ScriptedInterface::ErrorWithMessage( + LLVM_PRETTY_FUNCTION, toString(frame_or_error.takeError()), error); + return error.ToError(); + } + + return *frame_or_error; + }; + + StructuredData::ObjectSP obj_sp = m_interface_sp->GetFrameAtIndex(idx); + + // None/null means no more frames or error. + if (!obj_sp || !obj_sp->IsValid()) + return llvm::createStringError("invalid script object returned for frame " + + llvm::Twine(idx)); + + StackFrameSP synth_frame_sp = nullptr; + if (StructuredData::UnsignedInteger *int_obj = + obj_sp->GetAsUnsignedInteger()) { + uint32_t real_frame_index = int_obj->GetValue(); + if (real_frame_index < m_input_frames->GetNumFrames()) { + synth_frame_sp = m_input_frames->GetFrameAtIndex(real_frame_index); + } + } else if (StructuredData::Dictionary *dict = obj_sp->GetAsDictionary()) { + // Check if it's a dictionary describing a frame. 
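+    // Return-shape contract implemented by the branches below: an unsigned
+    // integer N reuses input frame N unchanged; a dict of the form
+    //   {"pc": 0x1000}
+    // (value illustrative) builds a synthetic frame at that pc; a
+    // ScriptedFrame script object is wrapped via ScriptedFrame::Create;
+    // any other type is an error.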
+ auto frame_from_dict_or_err = create_frame_from_dict(dict, idx); + if (!frame_from_dict_or_err) { + return llvm::createStringError(llvm::Twine( + "couldn't create frame from dictionary at index " + llvm::Twine(idx) + + ": " + toString(frame_from_dict_or_err.takeError()))); + } + synth_frame_sp = *frame_from_dict_or_err; + } else if (obj_sp->GetAsGeneric()) { + // It's a ScriptedFrame object. + auto frame_from_script_obj_or_err = create_frame_from_script_object(obj_sp); + if (!frame_from_script_obj_or_err) { + return llvm::createStringError( + llvm::Twine("couldn't create frame from script object at index " + + llvm::Twine(idx) + ": " + + toString(frame_from_script_obj_or_err.takeError()))); + } + synth_frame_sp = *frame_from_script_obj_or_err; + } else { + return llvm::createStringError( + llvm::Twine("invalid return type from get_frame_at_index at index " + + llvm::Twine(idx))); + } + + if (!synth_frame_sp) + return llvm::createStringError( + llvm::Twine("failed to create frame at index " + llvm::Twine(idx))); + + synth_frame_sp->SetFrameIndex(idx); + + return synth_frame_sp; +} + +namespace lldb_private { +void lldb_initialize_ScriptedFrameProvider() { + ScriptedFrameProvider::Initialize(); +} + +void lldb_terminate_ScriptedFrameProvider() { + ScriptedFrameProvider::Terminate(); +} +} // namespace lldb_private diff --git a/lldb/source/Plugins/SyntheticFrameProvider/ScriptedFrameProvider/ScriptedFrameProvider.h b/lldb/source/Plugins/SyntheticFrameProvider/ScriptedFrameProvider/ScriptedFrameProvider.h new file mode 100644 index 0000000000000..3434bf26ade24 --- /dev/null +++ b/lldb/source/Plugins/SyntheticFrameProvider/ScriptedFrameProvider/ScriptedFrameProvider.h @@ -0,0 +1,53 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLDB_PLUGINS_SYNTHETICFRAMEPROVIDER_SCRIPTEDFRAMEPROVIDER_SCRIPTEDFRAMEPROVIDER_H +#define LLDB_PLUGINS_SYNTHETICFRAMEPROVIDER_SCRIPTEDFRAMEPROVIDER_SCRIPTEDFRAMEPROVIDER_H + +#include "lldb/Target/SyntheticFrameProvider.h" +#include "lldb/Utility/ScriptedMetadata.h" +#include "lldb/Utility/Status.h" +#include "lldb/lldb-forward.h" +#include "llvm/Support/Error.h" + +namespace lldb_private { + +class ScriptedFrameProvider : public SyntheticFrameProvider { +public: + static llvm::StringRef GetPluginNameStatic() { + return "ScriptedFrameProvider"; + } + + static llvm::Expected + CreateInstance(lldb::StackFrameListSP input_frames, + const ScriptedFrameProviderDescriptor &descriptor); + + static void Initialize(); + + static void Terminate(); + + ScriptedFrameProvider(lldb::StackFrameListSP input_frames, + lldb::ScriptedFrameProviderInterfaceSP interface_sp, + const ScriptedFrameProviderDescriptor &descriptor); + ~ScriptedFrameProvider() override; + + llvm::StringRef GetPluginName() override { return GetPluginNameStatic(); } + + std::string GetDescription() const override; + + /// Get a single stack frame at the specified index. 
+ llvm::Expected GetFrameAtIndex(uint32_t idx) override; + +private: + lldb::ScriptedFrameProviderInterfaceSP m_interface_sp; + const ScriptedFrameProviderDescriptor &m_descriptor; +}; + +} // namespace lldb_private + +#endif // LLDB_PLUGINS_SYNTHETICFRAMEPROVIDER_SCRIPTEDFRAMEPROVIDER_SCRIPTEDFRAMEPROVIDER_H diff --git a/lldb/source/Target/StackFrameList.cpp b/lldb/source/Target/StackFrameList.cpp index ccf874fc03ebd..a661500ec862b 100644 --- a/lldb/source/Target/StackFrameList.cpp +++ b/lldb/source/Target/StackFrameList.cpp @@ -20,6 +20,7 @@ #include "lldb/Target/StackFrame.h" #include "lldb/Target/StackFrameRecognizer.h" #include "lldb/Target/StopInfo.h" +#include "lldb/Target/SyntheticFrameProvider.h" #include "lldb/Target/Target.h" #include "lldb/Target/Thread.h" #include "lldb/Target/Unwind.h" @@ -55,6 +56,40 @@ StackFrameList::~StackFrameList() { Clear(); } +SyntheticStackFrameList::SyntheticStackFrameList( + Thread &thread, lldb::StackFrameListSP input_frames, + const lldb::StackFrameListSP &prev_frames_sp, bool show_inline_frames) + : StackFrameList(thread, prev_frames_sp, show_inline_frames), + m_input_frames(std::move(input_frames)) {} + +bool SyntheticStackFrameList::FetchFramesUpTo( + uint32_t end_idx, InterruptionControl allow_interrupt) { + // Check if the thread has a synthetic frame provider. + if (auto provider_sp = m_thread.GetFrameProvider()) { + // Use the synthetic frame provider to generate frames lazily. + // Keep fetching until we reach end_idx or the provider returns an error. + for (uint32_t idx = m_frames.size(); idx <= end_idx; idx++) { + if (allow_interrupt && + m_thread.GetProcess()->GetTarget().GetDebugger().InterruptRequested()) + return true; + auto frame_or_err = provider_sp->GetFrameAtIndex(idx); + if (!frame_or_err) { + // Provider returned error - we've reached the end. + LLDB_LOG_ERROR(GetLog(LLDBLog::Thread), frame_or_err.takeError(), + "Frame provider reached end at index {0}: {1}", idx); + SetAllFramesFetched(); + break; + } + m_frames.push_back(*frame_or_err); + } + + return false; // Not interrupted. + } + + // If no provider, fall back to the base implementation. 
+ return StackFrameList::FetchFramesUpTo(end_idx, allow_interrupt); +} + void StackFrameList::CalculateCurrentInlinedDepth() { uint32_t cur_inlined_depth = GetCurrentInlinedDepth(); if (cur_inlined_depth == UINT32_MAX) { diff --git a/lldb/source/Target/SyntheticFrameProvider.cpp b/lldb/source/Target/SyntheticFrameProvider.cpp index 241ce82c39be3..97ff42d1ed53e 100644 --- a/lldb/source/Target/SyntheticFrameProvider.cpp +++ b/lldb/source/Target/SyntheticFrameProvider.cpp @@ -8,10 +8,12 @@ #include "lldb/Target/SyntheticFrameProvider.h" #include "lldb/Core/PluginManager.h" +#include "lldb/Interpreter/Interfaces/ScriptedFrameProviderInterface.h" #include "lldb/Target/Thread.h" #include "lldb/Utility/LLDBLog.h" #include "lldb/Utility/Log.h" #include "lldb/Utility/Status.h" +#include "lldb/Utility/Stream.h" using namespace lldb; using namespace lldb_private; @@ -21,12 +23,17 @@ SyntheticFrameProvider::SyntheticFrameProvider(StackFrameListSP input_frames) SyntheticFrameProvider::~SyntheticFrameProvider() = default; -void SyntheticFrameProviderDescriptor::Dump(Stream *s) const { +void ScriptedFrameProviderDescriptor::Dump(Stream *s) const { if (!s) return; + s->Format(" ID: {0:x}\n", GetID()); s->Printf(" Name: %s\n", GetName().str().c_str()); + std::string description = GetDescription(); + if (!description.empty()) + s->Printf(" Description: %s\n", description.c_str()); + // Show thread filter information. if (thread_specs.empty()) { s->PutCString(" Thread Filter: (applies to all threads)\n"); @@ -41,9 +48,23 @@ void SyntheticFrameProviderDescriptor::Dump(Stream *s) const { } } +uint32_t ScriptedFrameProviderDescriptor::GetID() const { + if (!scripted_metadata_sp) + return 0; + + return scripted_metadata_sp->GetID(); +} + +std::string ScriptedFrameProviderDescriptor::GetDescription() const { + // If we have an interface, call get_description() to fetch it. + if (interface_sp && scripted_metadata_sp) + return interface_sp->GetDescription(scripted_metadata_sp->GetClassName()); + return {}; +} + llvm::Expected SyntheticFrameProvider::CreateInstance( StackFrameListSP input_frames, - const SyntheticFrameProviderDescriptor &descriptor) { + const ScriptedFrameProviderDescriptor &descriptor) { if (!input_frames) return llvm::createStringError( "cannot create synthetic frame provider: invalid input frames"); diff --git a/lldb/source/Target/Target.cpp b/lldb/source/Target/Target.cpp index 3b51e17d1c4e0..3f182bc61392b 100644 --- a/lldb/source/Target/Target.cpp +++ b/lldb/source/Target/Target.cpp @@ -3718,6 +3718,61 @@ Status Target::Attach(ProcessAttachInfo &attach_info, Stream *stream) { return error; } +llvm::Expected Target::AddScriptedFrameProviderDescriptor( + const ScriptedFrameProviderDescriptor &descriptor) { + if (!descriptor.IsValid()) + return llvm::createStringError("invalid frame provider descriptor"); + + llvm::StringRef name = descriptor.GetName(); + if (name.empty()) + return llvm::createStringError( + "frame provider descriptor has no class name"); + + std::lock_guard guard( + m_frame_provider_descriptors_mutex); + + uint32_t descriptor_id = descriptor.GetID(); + m_frame_provider_descriptors[descriptor_id] = descriptor; + + // Clear frame providers on existing threads so they reload with new config. 
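+  // (Cleared providers are rebuilt lazily: the next call to
+  //  Thread::GetStackFrameList() rescans the descriptor map and reloads the
+  //  first matching provider.)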
+ if (ProcessSP process_sp = GetProcessSP()) + for (ThreadSP thread_sp : process_sp->Threads()) + thread_sp->ClearScriptedFrameProvider(); + + return descriptor_id; +} + +bool Target::RemoveScriptedFrameProviderDescriptor(uint32_t id) { + std::lock_guard guard( + m_frame_provider_descriptors_mutex); + bool removed = m_frame_provider_descriptors.erase(id); + + if (removed) + if (ProcessSP process_sp = GetProcessSP()) + for (ThreadSP thread_sp : process_sp->Threads()) + thread_sp->ClearScriptedFrameProvider(); + + return removed; +} + +void Target::ClearScriptedFrameProviderDescriptors() { + std::lock_guard guard( + m_frame_provider_descriptors_mutex); + + m_frame_provider_descriptors.clear(); + + if (ProcessSP process_sp = GetProcessSP()) + for (ThreadSP thread_sp : process_sp->Threads()) + thread_sp->ClearScriptedFrameProvider(); +} + +const llvm::DenseMap & +Target::GetScriptedFrameProviderDescriptors() const { + std::lock_guard guard( + m_frame_provider_descriptors_mutex); + return m_frame_provider_descriptors; +} + void Target::FinalizeFileActions(ProcessLaunchInfo &info) { Log *log = GetLog(LLDBLog::Process); diff --git a/lldb/source/Target/Thread.cpp b/lldb/source/Target/Thread.cpp index 8c3e19725f8cb..b40e753aca1e9 100644 --- a/lldb/source/Target/Thread.cpp +++ b/lldb/source/Target/Thread.cpp @@ -13,9 +13,12 @@ #include "lldb/Core/Module.h" #include "lldb/Core/StructuredDataImpl.h" #include "lldb/Host/Host.h" +#include "lldb/Interpreter/Interfaces/ScriptedFrameInterface.h" +#include "lldb/Interpreter/Interfaces/ScriptedFrameProviderInterface.h" #include "lldb/Interpreter/OptionValueFileSpecList.h" #include "lldb/Interpreter/OptionValueProperties.h" #include "lldb/Interpreter/Property.h" +#include "lldb/Interpreter/ScriptInterpreter.h" #include "lldb/Symbol/Function.h" #include "lldb/Target/ABI.h" #include "lldb/Target/DynamicLoader.h" @@ -26,6 +29,7 @@ #include "lldb/Target/ScriptedThreadPlan.h" #include "lldb/Target/StackFrameRecognizer.h" #include "lldb/Target/StopInfo.h" +#include "lldb/Target/SyntheticFrameProvider.h" #include "lldb/Target/SystemRuntime.h" #include "lldb/Target/Target.h" #include "lldb/Target/ThreadPlan.h" @@ -45,6 +49,7 @@ #include "lldb/Utility/LLDBLog.h" #include "lldb/Utility/Log.h" #include "lldb/Utility/RegularExpression.h" +#include "lldb/Utility/ScriptedMetadata.h" #include "lldb/Utility/State.h" #include "lldb/Utility/Stream.h" #include "lldb/Utility/StreamString.h" @@ -257,6 +262,7 @@ void Thread::DestroyThread() { std::lock_guard guard(m_frame_mutex); m_curr_frames_sp.reset(); m_prev_frames_sp.reset(); + m_frame_provider_sp.reset(); m_prev_framezero_pc.reset(); } @@ -1439,13 +1445,76 @@ void Thread::CalculateExecutionContext(ExecutionContext &exe_ctx) { StackFrameListSP Thread::GetStackFrameList() { std::lock_guard guard(m_frame_mutex); - if (!m_curr_frames_sp) + if (m_curr_frames_sp) + return m_curr_frames_sp; + + // First, try to load a frame provider if we don't have one yet. + if (!m_frame_provider_sp) { + ProcessSP process_sp = GetProcess(); + if (process_sp) { + Target &target = process_sp->GetTarget(); + const auto &descriptors = target.GetScriptedFrameProviderDescriptors(); + + // Find first descriptor that applies to this thread. 
+ for (const auto &entry : descriptors) { + const ScriptedFrameProviderDescriptor &descriptor = entry.second; + if (descriptor.IsValid() && descriptor.AppliesToThread(*this)) { + if (llvm::Error error = LoadScriptedFrameProvider(descriptor)) { + LLDB_LOG_ERROR(GetLog(LLDBLog::Thread), std::move(error), + "Failed to load scripted frame provider: {0}"); + } + break; // Use first matching descriptor (success or failure). + } + } + } + } + + // Create the frame list based on whether we have a provider. + if (m_frame_provider_sp) { + // We have a provider - create synthetic frame list. + StackFrameListSP input_frames = m_frame_provider_sp->GetInputFrames(); + m_curr_frames_sp = std::make_shared( + *this, input_frames, m_prev_frames_sp, true); + } else { + // No provider - use normal unwinder frames. m_curr_frames_sp = std::make_shared(*this, m_prev_frames_sp, true); + } return m_curr_frames_sp; } +llvm::Error Thread::LoadScriptedFrameProvider( + const ScriptedFrameProviderDescriptor &descriptor) { + std::lock_guard guard(m_frame_mutex); + + // Note: We don't create input_frames here - it will be created lazily + // by SyntheticStackFrameList when frames are first fetched. + // Creating them too early can cause crashes during thread initialization. + + // Create a temporary StackFrameList just to get the thread reference for the + // provider. The provider won't actually use this - it will get real input + // frames from SyntheticStackFrameList later. + StackFrameListSP temp_frames = + std::make_shared(*this, m_prev_frames_sp, true); + + auto provider_or_err = + SyntheticFrameProvider::CreateInstance(temp_frames, descriptor); + if (!provider_or_err) + return provider_or_err.takeError(); + + ClearScriptedFrameProvider(); + m_frame_provider_sp = *provider_or_err; + return llvm::Error::success(); +} + +void Thread::ClearScriptedFrameProvider() { + std::lock_guard guard(m_frame_mutex); + m_frame_provider_sp.reset(); + m_curr_frames_sp.reset(); + m_prev_frames_sp.reset(); +} + std::optional Thread::GetPreviousFrameZeroPC() { return m_prev_framezero_pc; } @@ -1466,6 +1535,7 @@ void Thread::ClearStackFrames() { m_prev_frames_sp.swap(m_curr_frames_sp); m_curr_frames_sp.reset(); + m_frame_provider_sp.reset(); m_extended_info.reset(); m_extended_info_fetched = false; } diff --git a/lldb/source/Target/ThreadSpec.cpp b/lldb/source/Target/ThreadSpec.cpp index ba4c3aa894553..624f64e3af800 100644 --- a/lldb/source/Target/ThreadSpec.cpp +++ b/lldb/source/Target/ThreadSpec.cpp @@ -19,6 +19,10 @@ const char *ThreadSpec::g_option_names[static_cast( ThreadSpec::ThreadSpec() : m_name(), m_queue_name() {} +ThreadSpec::ThreadSpec(Thread &thread) + : m_index(thread.GetIndexID()), m_tid(thread.GetID()), + m_name(thread.GetName()), m_queue_name(thread.GetQueueName()) {} + std::unique_ptr ThreadSpec::CreateFromStructuredData( const StructuredData::Dictionary &spec_dict, Status &error) { uint32_t index = UINT32_MAX; diff --git a/lldb/test/API/functionalities/scripted_frame_provider/Makefile b/lldb/test/API/functionalities/scripted_frame_provider/Makefile new file mode 100644 index 0000000000000..99998b20bcb05 --- /dev/null +++ b/lldb/test/API/functionalities/scripted_frame_provider/Makefile @@ -0,0 +1,3 @@ +CXX_SOURCES := main.cpp + +include Makefile.rules diff --git a/lldb/test/API/functionalities/scripted_frame_provider/TestScriptedFrameProvider.py b/lldb/test/API/functionalities/scripted_frame_provider/TestScriptedFrameProvider.py new file mode 100644 index 0000000000000..189ca2f147f9d --- /dev/null +++ 
b/lldb/test/API/functionalities/scripted_frame_provider/TestScriptedFrameProvider.py @@ -0,0 +1,339 @@ +""" +Test scripted frame provider functionality. +""" + +import os + +import lldb +from lldbsuite.test.lldbtest import TestBase +from lldbsuite.test import lldbutil + + +class ScriptedFrameProviderTestCase(TestBase): + NO_DEBUG_INFO_TESTCASE = True + + def setUp(self): + TestBase.setUp(self) + self.source = "main.cpp" + + def test_replace_all_frames(self): + """Test that we can replace the entire stack.""" + self.build() + target, process, thread, bkpt = lldbutil.run_to_source_breakpoint( + self, "Break here", lldb.SBFileSpec(self.source), only_one_thread=False + ) + + # Import the test frame provider + script_path = os.path.join(self.getSourceDir(), "test_frame_providers.py") + self.runCmd("command script import " + script_path) + + # Attach the Replace provider + error = lldb.SBError() + provider_id = target.RegisterScriptedFrameProvider( + "test_frame_providers.ReplaceFrameProvider", + lldb.SBStructuredData(), + error, + ) + self.assertTrue(error.Success(), f"Failed to register provider: {error}") + self.assertNotEqual(provider_id, 0, "Provider ID should be non-zero") + + # Verify we have exactly 3 synthetic frames + self.assertEqual(thread.GetNumFrames(), 3, "Should have 3 synthetic frames") + + # Verify frame indices and PCs (dictionary-based frames don't have custom function names) + frame0 = thread.GetFrameAtIndex(0) + self.assertIsNotNone(frame0) + self.assertEqual(frame0.GetPC(), 0x1000) + + frame1 = thread.GetFrameAtIndex(1) + self.assertIsNotNone(frame1) + self.assertIn("thread_func", frame1.GetFunctionName()) + + frame2 = thread.GetFrameAtIndex(2) + self.assertIsNotNone(frame2) + self.assertEqual(frame2.GetPC(), 0x3000) + + def test_prepend_frames(self): + """Test that we can add frames before real stack.""" + self.build() + target, process, thread, bkpt = lldbutil.run_to_source_breakpoint( + self, "Break here", lldb.SBFileSpec(self.source), only_one_thread=False + ) + + # Get original frame count and PC + original_frame_count = thread.GetNumFrames() + self.assertGreaterEqual( + original_frame_count, 2, "Should have at least 2 real frames" + ) + + # Import and attach Prepend provider + script_path = os.path.join(self.getSourceDir(), "test_frame_providers.py") + self.runCmd("command script import " + script_path) + + error = lldb.SBError() + provider_id = target.RegisterScriptedFrameProvider( + "test_frame_providers.PrependFrameProvider", + lldb.SBStructuredData(), + error, + ) + self.assertTrue(error.Success(), f"Failed to register provider: {error}") + self.assertNotEqual(provider_id, 0, "Provider ID should be non-zero") + + # Verify we have 2 more frames + new_frame_count = thread.GetNumFrames() + self.assertEqual(new_frame_count, original_frame_count + 2) + + # Verify first 2 frames are synthetic (check PCs, not function names) + frame0 = thread.GetFrameAtIndex(0) + self.assertEqual(frame0.GetPC(), 0x9000) + + frame1 = thread.GetFrameAtIndex(1) + self.assertEqual(frame1.GetPC(), 0xA000) + + # Verify frame 2 is the original real frame 0 + frame2 = thread.GetFrameAtIndex(2) + self.assertIn("thread_func", frame2.GetFunctionName()) + + def test_append_frames(self): + """Test that we can add frames after real stack.""" + self.build() + target, process, thread, bkpt = lldbutil.run_to_source_breakpoint( + self, "Break here", lldb.SBFileSpec(self.source), only_one_thread=False + ) + + # Get original frame count + original_frame_count = thread.GetNumFrames() + + # Import and attach 
Append provider + script_path = os.path.join(self.getSourceDir(), "test_frame_providers.py") + self.runCmd("command script import " + script_path) + + error = lldb.SBError() + provider_id = target.RegisterScriptedFrameProvider( + "test_frame_providers.AppendFrameProvider", + lldb.SBStructuredData(), + error, + ) + self.assertTrue(error.Success(), f"Failed to register provider: {error}") + self.assertNotEqual(provider_id, 0, "Provider ID should be non-zero") + + # Verify we have 1 more frame + new_frame_count = thread.GetNumFrames() + self.assertEqual(new_frame_count, original_frame_count + 1) + + # Verify first frames are still real + frame0 = thread.GetFrameAtIndex(0) + self.assertIn("thread_func", frame0.GetFunctionName()) + + frame_n_plus_1 = thread.GetFrameAtIndex(new_frame_count - 1) + self.assertEqual(frame_n_plus_1.GetPC(), 0x10) + + def test_scripted_frame_objects(self): + """Test that provider can return ScriptedFrame objects.""" + self.build() + target, process, thread, bkpt = lldbutil.run_to_source_breakpoint( + self, "Break here", lldb.SBFileSpec(self.source), only_one_thread=False + ) + + # Import the provider that returns ScriptedFrame objects + script_path = os.path.join(self.getSourceDir(), "test_frame_providers.py") + self.runCmd("command script import " + script_path) + + error = lldb.SBError() + provider_id = target.RegisterScriptedFrameProvider( + "test_frame_providers.ScriptedFrameObjectProvider", + lldb.SBStructuredData(), + error, + ) + self.assertTrue(error.Success(), f"Failed to register provider: {error}") + self.assertNotEqual(provider_id, 0, "Provider ID should be non-zero") + + # Verify we have 5 frames + self.assertEqual( + thread.GetNumFrames(), 5, "Should have 5 custom scripted frames" + ) + + # Verify frame properties from CustomScriptedFrame + frame0 = thread.GetFrameAtIndex(0) + self.assertIsNotNone(frame0) + self.assertEqual(frame0.GetFunctionName(), "custom_scripted_frame_0") + self.assertEqual(frame0.GetPC(), 0x5000) + self.assertTrue(frame0.IsSynthetic(), "Frame should be marked as synthetic") + + frame1 = thread.GetFrameAtIndex(1) + self.assertIsNotNone(frame1) + self.assertEqual(frame1.GetPC(), 0x6000) + + frame2 = thread.GetFrameAtIndex(2) + self.assertIsNotNone(frame2) + self.assertEqual(frame2.GetFunctionName(), "custom_scripted_frame_2") + self.assertEqual(frame2.GetPC(), 0x7000) + self.assertTrue(frame2.IsSynthetic(), "Frame should be marked as synthetic") + + def test_applies_to_thread(self): + """Test that applies_to_thread filters which threads get the provider.""" + self.build() + target, process, thread, bkpt = lldbutil.run_to_source_breakpoint( + self, "Break here", lldb.SBFileSpec(self.source), only_one_thread=False + ) + + # We should have at least 2 threads (worker threads) at the breakpoint + num_threads = process.GetNumThreads() + self.assertGreaterEqual( + num_threads, 2, "Should have at least 2 threads at breakpoint" + ) + + # Import the test frame provider + script_path = os.path.join(self.getSourceDir(), "test_frame_providers.py") + self.runCmd("command script import " + script_path) + + # Collect original thread info before applying provider + thread_info = {} + for i in range(num_threads): + t = process.GetThreadAtIndex(i) + thread_info[t.GetIndexID()] = { + "frame_count": t.GetNumFrames(), + "pc": t.GetFrameAtIndex(0).GetPC(), + } + + # Register the ThreadFilterFrameProvider which only applies to thread ID 1 + error = lldb.SBError() + provider_id = target.RegisterScriptedFrameProvider( + 
"test_frame_providers.ThreadFilterFrameProvider", + lldb.SBStructuredData(), + error, + ) + self.assertTrue(error.Success(), f"Failed to register provider: {error}") + self.assertNotEqual(provider_id, 0, "Provider ID should be non-zero") + + # Check each thread + thread_id_1_found = False + for i in range(num_threads): + t = process.GetThreadAtIndex(i) + thread_id = t.GetIndexID() + + if thread_id == 1: + # Thread with ID 1 should have synthetic frame + thread_id_1_found = True + self.assertEqual( + t.GetNumFrames(), + 1, + f"Thread with ID 1 should have 1 synthetic frame", + ) + self.assertEqual( + t.GetFrameAtIndex(0).GetPC(), + 0xFFFF, + f"Thread with ID 1 should have synthetic PC 0xFFFF", + ) + else: + # Other threads should keep their original frames + self.assertEqual( + t.GetNumFrames(), + thread_info[thread_id]["frame_count"], + f"Thread with ID {thread_id} should not be affected by provider", + ) + self.assertEqual( + t.GetFrameAtIndex(0).GetPC(), + thread_info[thread_id]["pc"], + f"Thread with ID {thread_id} should have its original PC", + ) + + # We should have found at least one thread with ID 1 + self.assertTrue( + thread_id_1_found, + "Should have found a thread with ID 1 to test filtering", + ) + + def test_remove_frame_provider_by_id(self): + """Test that RemoveScriptedFrameProvider removes a specific provider by ID.""" + self.build() + target, process, thread, bkpt = lldbutil.run_to_source_breakpoint( + self, "Break here", lldb.SBFileSpec(self.source), only_one_thread=False + ) + + # Import the test frame providers + script_path = os.path.join(self.getSourceDir(), "test_frame_providers.py") + self.runCmd("command script import " + script_path) + + # Get original frame count + original_frame_count = thread.GetNumFrames() + original_pc = thread.GetFrameAtIndex(0).GetPC() + + # Register the first provider and get its ID + error = lldb.SBError() + provider_id_1 = target.RegisterScriptedFrameProvider( + "test_frame_providers.ReplaceFrameProvider", + lldb.SBStructuredData(), + error, + ) + self.assertTrue(error.Success(), f"Failed to register provider 1: {error}") + + # Verify first provider is active (3 synthetic frames) + self.assertEqual(thread.GetNumFrames(), 3, "Should have 3 synthetic frames") + self.assertEqual( + thread.GetFrameAtIndex(0).GetPC(), 0x1000, "Should have first provider's PC" + ) + + # Register a second provider and get its ID + provider_id_2 = target.RegisterScriptedFrameProvider( + "test_frame_providers.PrependFrameProvider", + lldb.SBStructuredData(), + error, + ) + self.assertTrue(error.Success(), f"Failed to register provider 2: {error}") + + # Verify IDs are different + self.assertNotEqual( + provider_id_1, provider_id_2, "Provider IDs should be unique" + ) + + # Now remove the first provider by ID + result = target.RemoveScriptedFrameProvider(provider_id_1) + self.assertSuccess( + result, f"Should successfully remove provider with ID {provider_id_1}" + ) + + # After removing the first provider, the second provider should still be active + # The PrependFrameProvider adds 2 frames before the real stack + # Since ReplaceFrameProvider had 3 frames, and we removed it, we should now + # have the original frames (from real stack) with PrependFrameProvider applied + new_frame_count = thread.GetNumFrames() + self.assertEqual( + new_frame_count, + original_frame_count + 2, + "Should have original frames + 2 prepended frames", + ) + + # First two frames should be from PrependFrameProvider + self.assertEqual( + thread.GetFrameAtIndex(0).GetPC(), + 0x9000, + 
"First frame should be from PrependFrameProvider", + ) + self.assertEqual( + thread.GetFrameAtIndex(1).GetPC(), + 0xA000, + "Second frame should be from PrependFrameProvider", + ) + + # Remove the second provider + result = target.RemoveScriptedFrameProvider(provider_id_2) + self.assertSuccess( + result, f"Should successfully remove provider with ID {provider_id_2}" + ) + + # After removing both providers, frames should be back to original + self.assertEqual( + thread.GetNumFrames(), + original_frame_count, + "Should restore original frame count", + ) + self.assertEqual( + thread.GetFrameAtIndex(0).GetPC(), + original_pc, + "Should restore original PC", + ) + + # Try to remove a provider that doesn't exist + result = target.RemoveScriptedFrameProvider(999999) + self.assertTrue(result.Fail(), "Should fail to remove non-existent provider") diff --git a/lldb/test/API/functionalities/scripted_frame_provider/main.cpp b/lldb/test/API/functionalities/scripted_frame_provider/main.cpp new file mode 100644 index 0000000000000..f15cb282f9d25 --- /dev/null +++ b/lldb/test/API/functionalities/scripted_frame_provider/main.cpp @@ -0,0 +1,55 @@ +// Multi-threaded test program for testing frame providers. + +#include +#include +#include +#include + +std::mutex mtx; +std::condition_variable cv; +int ready_count = 0; +constexpr int NUM_THREADS = 2; + +void thread_func(int thread_num) { + std::cout << "Thread " << thread_num << " started\n"; + + { + std::unique_lock lock(mtx); + ready_count++; + if (ready_count == NUM_THREADS + 1) { + cv.notify_all(); + } else { + cv.wait(lock, [] { return ready_count == NUM_THREADS + 1; }); + } + } + + std::cout << "Thread " << thread_num << " at breakpoint\n"; // Break here +} + +int main(int argc, char **argv) { + std::thread threads[NUM_THREADS]; + + for (int i = 0; i < NUM_THREADS; i++) { + threads[i] = std::thread(thread_func, i); + } + + { + std::unique_lock lock(mtx); + ready_count++; + if (ready_count == NUM_THREADS + 1) { + cv.notify_all(); + } else { + cv.wait(lock, [] { return ready_count == NUM_THREADS + 1; }); + } + } + + std::cout << "Main thread at barrier\n"; + + // Join threads + for (int i = 0; i < NUM_THREADS; i++) { + threads[i].join(); + } + + std::cout << "All threads completed\n"; + return 0; +} diff --git a/lldb/test/API/functionalities/scripted_frame_provider/test_frame_providers.py b/lldb/test/API/functionalities/scripted_frame_provider/test_frame_providers.py new file mode 100644 index 0000000000000..91aa13e44339a --- /dev/null +++ b/lldb/test/API/functionalities/scripted_frame_provider/test_frame_providers.py @@ -0,0 +1,176 @@ +""" +Test frame providers for scripted frame provider functionality. 
+ +These providers demonstrate various merge strategies: +- Replace: Replace entire stack +- Prepend: Add frames before real stack +- Append: Add frames after real stack + +It also shows the ability to mix a dictionary, a ScriptedFrame or an SBFrame +index to create stackframes +""" + +import lldb +from lldb.plugins.scripted_process import ScriptedFrame +from lldb.plugins.scripted_frame_provider import ScriptedFrameProvider + + +class ReplaceFrameProvider(ScriptedFrameProvider): + """Replace entire stack with custom frames.""" + + def __init__(self, input_frames, args): + super().__init__(input_frames, args) + self.frames = [ + { + "idx": 0, + "pc": 0x1000, + }, + 0, + { + "idx": 2, + "pc": 0x3000, + }, + ] + + @staticmethod + def get_description(): + """Return a description of this provider.""" + return "Replace entire stack with 3 custom frames" + + def get_frame_at_index(self, index): + if index >= len(self.frames): + return None + return self.frames[index] + + +class PrependFrameProvider(ScriptedFrameProvider): + """Prepend synthetic frames before real stack.""" + + def __init__(self, input_frames, args): + super().__init__(input_frames, args) + + @staticmethod + def get_description(): + """Return a description of this provider.""" + return "Prepend 2 synthetic frames before real stack" + + def get_frame_at_index(self, index): + if index == 0: + return {"pc": 0x9000} + elif index == 1: + return {"pc": 0xA000} + elif index - 2 < len(self.input_frames): + return index - 2 # Return real frame index + return None + + +class AppendFrameProvider(ScriptedFrameProvider): + """Append synthetic frames after real stack.""" + + def __init__(self, input_frames, args): + super().__init__(input_frames, args) + + @staticmethod + def get_description(): + """Return a description of this provider.""" + return "Append 1 synthetic frame after real stack" + + def get_frame_at_index(self, index): + if index < len(self.input_frames): + return index # Return real frame index + elif index == len(self.input_frames): + return { + "idx": 1, + "pc": 0x10, + } + return None + + +class CustomScriptedFrame(ScriptedFrame): + """Custom scripted frame with full control over frame behavior.""" + + def __init__(self, thread, idx, pc, function_name): + # Initialize structured data args + args = lldb.SBStructuredData() + super().__init__(thread, args) + + self.idx = idx + self.pc = pc + self.function_name = function_name + + def get_id(self): + """Return the frame index.""" + return self.idx + + def get_pc(self): + """Return the program counter.""" + return self.pc + + def get_function_name(self): + """Return the function name.""" + return self.function_name + + def is_artificial(self): + """Mark as artificial frame.""" + return False + + def is_hidden(self): + """Not hidden.""" + return False + + def get_register_context(self): + """No register context for this test.""" + return None + + +class ScriptedFrameObjectProvider(ScriptedFrameProvider): + """Provider that returns ScriptedFrame objects instead of dictionaries.""" + + def __init__(self, input_frames, args): + super().__init__(input_frames, args) + + @staticmethod + def get_description(): + """Return a description of this provider.""" + return "Provider returning custom ScriptedFrame objects" + + def get_frame_at_index(self, index): + """Return ScriptedFrame objects or dictionaries based on index.""" + if index == 0: + return CustomScriptedFrame( + self.thread, 0, 0x5000, "custom_scripted_frame_0" + ) + elif index == 1: + return {"pc": 0x6000} + elif index == 2: + 
return CustomScriptedFrame(
+                self.thread, 2, 0x7000, "custom_scripted_frame_2"
+            )
+        elif index == 3:
+            return len(self.input_frames) - 2  # Real frame index
+        elif index == 4:
+            return len(self.input_frames) - 1  # Real frame index
+        return None
+
+
+class ThreadFilterFrameProvider(ScriptedFrameProvider):
+    """Provider that only applies to thread with ID 1."""
+
+    @staticmethod
+    def applies_to_thread(thread):
+        """Only apply to thread with index ID 1."""
+        return thread.GetIndexID() == 1
+
+    def __init__(self, input_frames, args):
+        super().__init__(input_frames, args)
+
+    @staticmethod
+    def get_description():
+        """Return a description of this provider."""
+        return "Provider that only applies to thread ID 1"
+
+    def get_frame_at_index(self, index):
+        """Return a single synthetic frame."""
+        if index == 0:
+            return {"pc": 0xFFFF}
+        return None
diff --git a/lldb/unittests/ScriptInterpreter/Python/PythonTestSuite.cpp b/lldb/unittests/ScriptInterpreter/Python/PythonTestSuite.cpp
index a63b740d9472f..5694aeeff3e5b 100644
--- a/lldb/unittests/ScriptInterpreter/Python/PythonTestSuite.cpp
+++ b/lldb/unittests/ScriptInterpreter/Python/PythonTestSuite.cpp
@@ -136,6 +136,11 @@ lldb_private::python::LLDBSWIGPython_CastPyObjectToSBStream(PyObject *data) {
   return nullptr;
 }
 
+void *
+lldb_private::python::LLDBSWIGPython_CastPyObjectToSBThread(PyObject *data) {
+  return nullptr;
+}
+
 void *
 lldb_private::python::LLDBSWIGPython_CastPyObjectToSBFrame(PyObject *data) {
   return nullptr;

From 9697f4b9e43151f8797ec199ec9a1fce2d208d7d Mon Sep 17 00:00:00 2001
From: Hongyu Chen
Date: Wed, 12 Nov 2025 04:33:41 +0800
Subject: [PATCH 45/64] [WebAssembly][FastISel] Bail out on meeting non-integer
 type in selectTrunc (#167165)

Fixes https://github.com/llvm/llvm-project/issues/165438

With `simd128` enabled, we may encounter vector type truncation in
FastISel. To respect #138479, this patch merely bails out on non-integer
IR types, though I would prefer bailing out for all non-simple types, as
most targets (X86, AArch64) do.
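As a quick illustration of the FastISel contract this change leans on, here
is a minimal standalone sketch (the TypeKind model and function names are
invented for illustration, not LLVM API): a selector that returns false
simply bails out, and the framework falls back to SelectionDAG for that
instruction.

  #include <cassert>

  // Illustrative stand-in for the IR type query; real code inspects Type *.
  enum class TypeKind { Integer, Vector, FloatingPoint };

  // FastISel-style selector sketch: returning false means "bail out", and
  // the slower but complete SelectionDAG path handles the instruction.
  bool selectTruncSketch(TypeKind SrcTy, TypeKind DstTy) {
    // The fix: only handle plain integer truncation. With simd128 enabled,
    // vector truncates can reach the selector and must be rejected here.
    if (SrcTy != TypeKind::Integer || DstTy != TypeKind::Integer)
      return false;
    // ... emit i32.wrap_i64 or a copy, as in the real selectTrunc ...
    return true;
  }

  int main() {
    assert(selectTruncSketch(TypeKind::Integer, TypeKind::Integer));
    assert(!selectTruncSketch(TypeKind::Vector, TypeKind::Vector));
    return 0;
  }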
---
 .../WebAssembly/WebAssemblyFastISel.cpp       | 38 +++++++++++++------
 .../CodeGen/WebAssembly/fast-isel-pr138479.ll |  5 ++-
 .../CodeGen/WebAssembly/fast-isel-simd128.ll  | 23 +++++++++++
 3 files changed, 54 insertions(+), 12 deletions(-)
 create mode 100644 llvm/test/CodeGen/WebAssembly/fast-isel-simd128.ll

diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp
index 66ed8b078b808..9d8e09c09e9ea 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp
@@ -988,20 +988,36 @@ bool WebAssemblyFastISel::selectSelect(const Instruction *I) {
 
 bool WebAssemblyFastISel::selectTrunc(const Instruction *I) {
   const auto *Trunc = cast<TruncInst>(I);
 
-  Register Reg = getRegForValue(Trunc->getOperand(0));
-  if (Reg == 0)
+  const Value *Op = Trunc->getOperand(0);
+  MVT::SimpleValueType From = getSimpleType(Op->getType());
+  MVT::SimpleValueType To = getLegalType(getSimpleType(Trunc->getType()));
+  Register In = getRegForValue(Op);
+  if (In == 0)
     return false;
 
-  unsigned FromBitWidth = Trunc->getOperand(0)->getType()->getIntegerBitWidth();
-  unsigned ToBitWidth = Trunc->getType()->getIntegerBitWidth();
+  auto Truncate = [&](Register Reg) -> unsigned {
+    if (From == MVT::i64) {
+      if (To == MVT::i64)
+        return copyValue(Reg);
+
+      if (To == MVT::i1 || To == MVT::i8 || To == MVT::i16 || To == MVT::i32) {
+        Register Result = createResultReg(&WebAssembly::I32RegClass);
+        BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,
+                TII.get(WebAssembly::I32_WRAP_I64), Result)
+            .addReg(Reg);
+        return Result;
+      }
+    }
 
-  if (ToBitWidth <= 32 && (32 < FromBitWidth && FromBitWidth <= 64)) {
-    Register Result = createResultReg(&WebAssembly::I32RegClass);
-    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,
-            TII.get(WebAssembly::I32_WRAP_I64), Result)
-        .addReg(Reg);
-    Reg = Result;
-  }
+    if (From == MVT::i32)
+      return copyValue(Reg);
+
+    return 0;
+  };
+
+  unsigned Reg = Truncate(In);
+  if (Reg == 0)
+    return false;
 
   updateValueMap(Trunc, Reg);
   return true;
diff --git a/llvm/test/CodeGen/WebAssembly/fast-isel-pr138479.ll b/llvm/test/CodeGen/WebAssembly/fast-isel-pr138479.ll
index 2676000b968c3..1eb50d5f9564a 100644
--- a/llvm/test/CodeGen/WebAssembly/fast-isel-pr138479.ll
+++ b/llvm/test/CodeGen/WebAssembly/fast-isel-pr138479.ll
@@ -1,4 +1,5 @@
-; RUN: llc < %s -asm-verbose=false -fast-isel -fast-isel-abort=1 -verify-machineinstrs | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc < %s -asm-verbose=false -fast-isel -fast-isel-abort=0 -verify-machineinstrs | FileCheck %s
 
 target triple = "wasm32-unknown-unknown"
 
@@ -13,3 +14,5 @@ define void @call_trunc_i64_to_i48(i64 %x) {
   call void @extern48(i48 %x48)
   ret void
 }
+;; NOTE: These prefixes are unused and the list is autogenerated.
Do not add tests below this line: +; CHECK: {{.*}} diff --git a/llvm/test/CodeGen/WebAssembly/fast-isel-simd128.ll b/llvm/test/CodeGen/WebAssembly/fast-isel-simd128.ll new file mode 100644 index 0000000000000..df14e1054d91b --- /dev/null +++ b/llvm/test/CodeGen/WebAssembly/fast-isel-simd128.ll @@ -0,0 +1,23 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc < %s -fast-isel -fast-isel-abort=0 -mattr=+simd128 -verify-machineinstrs | FileCheck %s + +target triple = "wasm32-unknown-unknown" + +define i8 @pr165438(<4 x i32> %0) { +; CHECK-LABEL: pr165438: +; CHECK: .functype pr165438 (v128) -> (i32) +; CHECK-NEXT: # %bb.0: # %entry +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: i8x16.shuffle 0, 4, 8, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; CHECK-NEXT: i8x16.extract_lane_u 0 +; CHECK-NEXT: # fallthrough-return +entry: + %conv = trunc <4 x i32> %0 to <4 x i8> + br label %cond.true + + +cond.true: ; preds = %entry + %vecext = extractelement <4 x i8> %conv, i32 0 + ret i8 %vecext +} From 810d99335baf80c668c528d5abe9169da3214651 Mon Sep 17 00:00:00 2001 From: Aiden Grossman Date: Wed, 5 Nov 2025 18:28:53 +0000 Subject: [PATCH 46/64] [Github] Allow Premerge to use issue-write workflow We want the premerge advisor to write out comments, and we need the issue-write workflow to trigger on it in order for this to work. Landing this before the rest of #166609 to enable testing that given this needs to be in repo due to permissions issues. --- .github/workflows/issue-write.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/issue-write.yml b/.github/workflows/issue-write.yml index 8a083f9143ec6..4f8fd7a48aff6 100644 --- a/.github/workflows/issue-write.yml +++ b/.github/workflows/issue-write.yml @@ -7,6 +7,7 @@ on: - "Check for private emails used in PRs" - "PR Request Release Note" - "Code lint" + - "CI Checks" types: - completed From c41ef17653d7e2eea2435abdf3e963406d270c85 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Tue, 11 Nov 2025 21:05:14 +0000 Subject: [PATCH 47/64] [VPlan] Add getSingleUser helper (NFC). Add helper to make it easier to retrieve the single user of a VPUser. --- .../Transforms/Vectorize/LoopVectorize.cpp | 27 +++++++++---------- .../Transforms/Vectorize/VPlanTransforms.cpp | 20 ++++++++------ llvm/lib/Transforms/Vectorize/VPlanValue.h | 7 +++++ 3 files changed, 31 insertions(+), 23 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 345bc63081b81..14cea1c8fa67f 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -4226,18 +4226,16 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() { // Selects are only modelled in the legacy cost model for safe // divisors. 
     case Instruction::Select: {
-      VPValue *VPV = VPI->getVPSingleValue();
-      if (VPV->getNumUsers() == 1) {
-        if (auto *WR = dyn_cast<VPWidenRecipe>(*VPV->user_begin())) {
-          switch (WR->getOpcode()) {
-          case Instruction::UDiv:
-          case Instruction::SDiv:
-          case Instruction::URem:
-          case Instruction::SRem:
-            continue;
-          default:
-            break;
-          }
+      if (auto *WR =
+              dyn_cast_or_null<VPWidenRecipe>(VPI->getSingleUser())) {
+        switch (WR->getOpcode()) {
+        case Instruction::UDiv:
+        case Instruction::SDiv:
+        case Instruction::URem:
+        case Instruction::SRem:
+          continue;
+        default:
+          break;
         }
       }
       C += VPI->cost(VF, CostCtx);
@@ -6976,11 +6974,10 @@ static bool planContainsAdditionalSimplifications(VPlan &Plan,
   // the more accurate VPlan-based cost model.
   for (VPRecipeBase &R : *Plan.getVectorPreheader()) {
     auto *VPI = dyn_cast<VPInstruction>(&R);
-    if (!VPI || VPI->getOpcode() != Instruction::Select ||
-        VPI->getNumUsers() != 1)
+    if (!VPI || VPI->getOpcode() != Instruction::Select)
       continue;
 
-    if (auto *WR = dyn_cast<VPWidenRecipe>(*VPI->user_begin())) {
+    if (auto *WR = dyn_cast_or_null<VPWidenRecipe>(VPI->getSingleUser())) {
       switch (WR->getOpcode()) {
       case Instruction::UDiv:
       case Instruction::SDiv:
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 1b9bc4cc45163..a6557141c47af 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -580,10 +580,13 @@ void VPlanTransforms::removeDeadRecipes(VPlan &Plan) {
 
     // Check if R is a dead VPPhi <-> update cycle and remove it.
     auto *PhiR = dyn_cast<VPPhi>(&R);
-    if (!PhiR || PhiR->getNumOperands() != 2 || PhiR->getNumUsers() != 1)
+    if (!PhiR || PhiR->getNumOperands() != 2)
+      continue;
+    VPUser *PhiUser = PhiR->getSingleUser();
+    if (!PhiUser)
       continue;
     VPValue *Incoming = PhiR->getOperand(1);
-    if (*PhiR->user_begin() != Incoming->getDefiningRecipe() ||
+    if (PhiUser != Incoming->getDefiningRecipe() ||
         Incoming->getNumUsers() != 1)
       continue;
     PhiR->replaceAllUsesWith(PhiR->getOperand(0));
@@ -1307,7 +1310,7 @@ static void simplifyRecipe(VPSingleDefRecipe *Def, VPTypeAnalysis &TypeInfo) {
       isa<VPPhi>(X)) {
     auto *Phi = cast<VPPhi>(X);
     if (Phi->getOperand(1) != Def && match(Phi->getOperand(0), m_ZeroInt()) &&
-        Phi->getNumUsers() == 1 && (*Phi->user_begin() == Def)) {
+        Phi->getSingleUser() == Def) {
       Phi->setOperand(0, Y);
       Def->replaceAllUsesWith(Phi);
       return;
@@ -1592,10 +1595,11 @@ static bool optimizeVectorInductionWidthForTCAndVFUF(VPlan &Plan,
 
   // Currently only handle cases where the single user is a header-mask
   // comparison with the backedge-taken-count.
-  if (!match(*WideIV->user_begin(),
-             m_ICmp(m_Specific(WideIV),
-                    m_Broadcast(
-                        m_Specific(Plan.getOrCreateBackedgeTakenCount())))))
+  VPUser *SingleUser = WideIV->getSingleUser();
+  if (!SingleUser ||
+      !match(SingleUser, m_ICmp(m_Specific(WideIV),
+                                m_Broadcast(m_Specific(
+                                    Plan.getOrCreateBackedgeTakenCount())))))
     continue;
 
   // Update IV operands and comparison bound to use new narrower type.
@@ -1607,7 +1611,7 @@ static bool optimizeVectorInductionWidthForTCAndVFUF(VPlan &Plan,
   auto *NewBTC = new VPWidenCastRecipe(
       Instruction::Trunc, Plan.getOrCreateBackedgeTakenCount(), NewIVTy);
   Plan.getVectorPreheader()->appendRecipe(NewBTC);
-  auto *Cmp = cast<VPInstruction>(*WideIV->user_begin());
+  auto *Cmp = cast<VPInstruction>(WideIV->getSingleUser());
   Cmp->setOperand(1, NewBTC);
   MadeChange = true;
diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h
index 5da74630ef626..09fdf5a731816 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanValue.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h
@@ -150,6 +150,13 @@ class LLVM_ABI_FOR_TEST VPValue {
 
   bool hasOneUse() const { return getNumUsers() == 1; }
 
+  /// Return the single user of this value, or nullptr if there is not exactly
+  /// one user.
+  VPUser *getSingleUser() { return hasOneUse() ? *user_begin() : nullptr; }
+  const VPUser *getSingleUser() const {
+    return hasOneUse() ? *user_begin() : nullptr;
+  }
+
   void replaceAllUsesWith(VPValue *New);
 
   /// Go through the uses list for this VPValue and make each use point to \p

From 2f7a5f7ffdff0660d465bf6024177529bc857de7 Mon Sep 17 00:00:00 2001
From: Raul Tambre
Date: Tue, 11 Nov 2025 23:13:30 +0200
Subject: [PATCH 48/64] [NFCI][lldb][test] Avoid GNU extension for specifying
 mangling (#167221)

`asm()` on function declarations is used for specifying the mangling.
But that specific spelling is a GNU extension, unlike `__asm()`.
Found by building with `-std=c2y` in Clang's C frontend's config file.

---
 lldb/test/Shell/Unwind/Inputs/call-asm.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/lldb/test/Shell/Unwind/Inputs/call-asm.c b/lldb/test/Shell/Unwind/Inputs/call-asm.c
index b154c1ac1385d..778c16b36a761 100644
--- a/lldb/test/Shell/Unwind/Inputs/call-asm.c
+++ b/lldb/test/Shell/Unwind/Inputs/call-asm.c
@@ -1,3 +1,3 @@
-int asm_main() asm("asm_main");
-
+// Explicit mangling is necessary as on Darwin an underscore is prepended to the symbol.
+int asm_main() __asm("asm_main");
 int main() { return asm_main(); }

From 2d8563f4b880184c001da00decd92f2b0d6befe6 Mon Sep 17 00:00:00 2001
From: Saleem Abdulrasool
Date: Tue, 11 Nov 2025 13:37:21 -0800
Subject: [PATCH 49/64] AArch64: align pair-wise spills on WoS to 16-byte
 (#166902)

Adjust the frame setup code for Windows ARM64 to attempt to align
pair-wise spills to 16-byte boundaries. This enables us to properly emit
the spills for custom clang calling conventions such as preserve_most,
which spills x9-x15, registers that are normally volatile under the
Windows AAPCS. Even when using the ARM64EC opcodes for the unwinding, we
cannot represent the spill if it is unaligned.
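For intuition, a self-contained sketch of the pairing rule (assuming 8-byte
GPR spill slots; canPairSpill and the Reg enum are illustrative, not the
actual AArch64FrameLowering interface, whose invalidateWindowsRegisterPairing
uses the inverted polarity):

  #include <cassert>

  enum Reg { X9 = 9, X10, X11, X12, X13, X14, X15, FP = 29, LR = 30 };

  bool canPairSpill(bool SpillExtendedVolatile, unsigned SpillCount, Reg R1,
                    Reg R2) {
    // Only consecutive registers (or the {FP,LR} frame record) may pair.
    if (R2 != R1 + 1 && !(R1 == FP && R2 == LR))
      return false;
    if (!SpillExtendedVolatile)
      return true;
    // save_any_reg_p needs a 16-byte aligned slot: with 8-byte slots the
    // pair must start at an even spill position; {FP,LR} is exempt.
    return (R1 == FP && R2 == LR) || (SpillCount % 2) == 0;
  }

  int main() {
    assert(canPairSpill(true, 0, X10, X11));  // even position: pairable
    assert(!canPairSpill(true, 1, X12, X13)); // odd position: must split
    assert(canPairSpill(true, 3, FP, LR));    // frame record is always fine
    return 0;
  }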
--- .../Target/AArch64/AArch64FrameLowering.cpp | 54 +++++++++++++++---- .../CodeGen/AArch64/seh-extended-spills.ll | 48 +++++++++++------ 2 files changed, 75 insertions(+), 27 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp index 70c5c29149288..de55704a37531 100644 --- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp @@ -1554,8 +1554,10 @@ static bool produceCompactUnwindFrame(const AArch64FrameLowering &AFL, !AFL.requiresSaveVG(MF) && !AFI->isSVECC(); } -static bool invalidateWindowsRegisterPairing(unsigned Reg1, unsigned Reg2, - bool NeedsWinCFI, bool IsFirst, +static bool invalidateWindowsRegisterPairing(bool SpillExtendedVolatile, + unsigned SpillCount, unsigned Reg1, + unsigned Reg2, bool NeedsWinCFI, + bool IsFirst, const TargetRegisterInfo *TRI) { // If we are generating register pairs for a Windows function that requires // EH support, then pair consecutive registers only. There are no unwind @@ -1568,8 +1570,18 @@ static bool invalidateWindowsRegisterPairing(unsigned Reg1, unsigned Reg2, return true; if (!NeedsWinCFI) return false; + + // ARM64EC introduced `save_any_regp`, which expects 16-byte alignment. + // This is handled by only allowing paired spills for registers spilled at + // even positions (which should be 16-byte aligned, as other GPRs/FPRs are + // 8-bytes). We carve out an exception for {FP,LR}, which does not require + // 16-byte alignment in the uop representation. if (TRI->getEncodingValue(Reg2) == TRI->getEncodingValue(Reg1) + 1) - return false; + return SpillExtendedVolatile + ? !((Reg1 == AArch64::FP && Reg2 == AArch64::LR) || + (SpillCount % 2) == 0) + : false; + // If pairing a GPR with LR, the pair can be described by the save_lrpair // opcode. If this is the first register pair, it would end up with a // predecrement, but there's no save_lrpair_x opcode, so we can only do this @@ -1585,12 +1597,15 @@ static bool invalidateWindowsRegisterPairing(unsigned Reg1, unsigned Reg2, /// WindowsCFI requires that only consecutive registers can be paired. /// LR and FP need to be allocated together when the frame needs to save /// the frame-record. This means any other register pairing with LR is invalid. -static bool invalidateRegisterPairing(unsigned Reg1, unsigned Reg2, - bool UsesWinAAPCS, bool NeedsWinCFI, - bool NeedsFrameRecord, bool IsFirst, +static bool invalidateRegisterPairing(bool SpillExtendedVolatile, + unsigned SpillCount, unsigned Reg1, + unsigned Reg2, bool UsesWinAAPCS, + bool NeedsWinCFI, bool NeedsFrameRecord, + bool IsFirst, const TargetRegisterInfo *TRI) { if (UsesWinAAPCS) - return invalidateWindowsRegisterPairing(Reg1, Reg2, NeedsWinCFI, IsFirst, + return invalidateWindowsRegisterPairing(SpillExtendedVolatile, SpillCount, + Reg1, Reg2, NeedsWinCFI, IsFirst, TRI); // If we need to store the frame record, don't pair any register @@ -1688,6 +1703,21 @@ void computeCalleeSaveRegisterPairs(const AArch64FrameLowering &AFL, } bool FPAfterSVECalleeSaves = IsWindows && AFI->getSVECalleeSavedStackSize(); + // Windows AAPCS has x9-x15 as volatile registers, x16-x17 as intra-procedural + // scratch, x18 as platform reserved. However, clang has extended calling + // convensions such as preserve_most and preserve_all which treat these as + // CSR. As such, the ARM64 unwind uOPs bias registers by 19. We use ARM64EC + // uOPs which have separate restrictions. We need to check for that. 
+ // + // NOTE: we currently do not account for the D registers as LLVM does not + // support non-ABI compliant D register spills. + bool SpillExtendedVolatile = + IsWindows && std::any_of(std::begin(CSI), std::end(CSI), + [](const CalleeSavedInfo &CSI) { + const auto &Reg = CSI.getReg(); + return Reg >= AArch64::X0 && + Reg <= AArch64::X18; + }); int ZPRByteOffset = 0; int PPRByteOffset = 0; @@ -1749,17 +1779,19 @@ void computeCalleeSaveRegisterPairs(const AArch64FrameLowering &AFL, if (unsigned(i + RegInc) < Count && !HasCSHazardPadding) { MCRegister NextReg = CSI[i + RegInc].getReg(); bool IsFirst = i == FirstReg; + unsigned SpillCount = NeedsWinCFI ? FirstReg - i : i; switch (RPI.Type) { case RegPairInfo::GPR: if (AArch64::GPR64RegClass.contains(NextReg) && - !invalidateRegisterPairing(RPI.Reg1, NextReg, IsWindows, - NeedsWinCFI, NeedsFrameRecord, IsFirst, - TRI)) + !invalidateRegisterPairing( + SpillExtendedVolatile, SpillCount, RPI.Reg1, NextReg, IsWindows, + NeedsWinCFI, NeedsFrameRecord, IsFirst, TRI)) RPI.Reg2 = NextReg; break; case RegPairInfo::FPR64: if (AArch64::FPR64RegClass.contains(NextReg) && - !invalidateWindowsRegisterPairing(RPI.Reg1, NextReg, NeedsWinCFI, + !invalidateWindowsRegisterPairing(SpillExtendedVolatile, SpillCount, + RPI.Reg1, NextReg, NeedsWinCFI, IsFirst, TRI)) RPI.Reg2 = NextReg; break; diff --git a/llvm/test/CodeGen/AArch64/seh-extended-spills.ll b/llvm/test/CodeGen/AArch64/seh-extended-spills.ll index ecc22703ef584..54f8e3f4c5a64 100644 --- a/llvm/test/CodeGen/AArch64/seh-extended-spills.ll +++ b/llvm/test/CodeGen/AArch64/seh-extended-spills.ll @@ -1,4 +1,5 @@ ; RUN: llc -mtriple aarch64-unknown-windows-msvc -filetype asm -o - %s | FileCheck %s +; RUN: llc -mtriple aarch64-unknown-windows-msvc -filetype obj -o - %s | llvm-readobj -u - | FileCheck %s -check-prefix CHECK-UNWIND declare dso_local void @g(ptr noundef) define dso_local preserve_mostcc void @f(ptr noundef %p) #0 { @@ -12,23 +13,38 @@ entry: attributes #0 = { nounwind uwtable(sync) } -; CHECK: stp x9, x10, [sp, #[[OFFSET_0:[0-9]+]]] -; CHECK-NEXT: .seh_save_any_reg_p x9, [[OFFSET_0]] -; CHECK: stp x11, x12, [sp, #[[OFFSET_1:[0-9]+]]] -; CHECK-NEXT: .seh_save_any_reg_p x11, [[OFFSET_1]] -; CHECK: stp x13, x14, [sp, #[[OFFSET_2:[0-9]+]]] -; CHECK-NEXT: .seh_save_any_reg_p x13, [[OFFSET_2]] -; CHECK: str x15, [sp, #[[OFFSET_3:[0-9]+]]] -; CHECK-NEXT: .seh_save_any_reg x15, [[OFFSET_3]] +; CHECK: str x30, [sp, #16] +; CHECK-NEXT: .seh_save_reg x30, 16 +; CHECK: str x9, [sp, #24] +; CHECK-NEXT: .seh_save_any_reg x9, 24 +; CHECK: stp x10, x11, [sp, #32 +; CHECK-NEXT: .seh_save_any_reg_p x10, 32 +; CHECK: stp x12, x13, [sp, #48] +; CHECK-NEXT: .seh_save_any_reg_p x12, 48 +; CHECK: stp x14, x15, [sp, #64] +; CHECK-NEXT: .seh_save_any_reg_p x14, 64 ; CHECK: .seh_endprologue ; CHECK: .seh_startepilogue -; CHECK: ldr x15, [sp, #[[OFFSET_3]]] -; CHECK-NEXT: .seh_save_any_reg x15, [[OFFSET_3]] -; CHECK: ldp x13, x14, [sp, #[[OFFSET_2]]] -; CHECK-NEXT: .seh_save_any_reg_p x13, [[OFFSET_2]] -; CHECK: ldp x11, x12, [sp, #[[OFFSET_1]]] -; CHECK-NEXT: .seh_save_any_reg_p x11, [[OFFSET_1]] -; CHECK: ldp x9, x10, [sp, #[[OFFSET_0]]] -; CHECK-NEXT: .seh_save_any_reg_p x9, [[OFFSET_0]] +; CHECK: ldp x14, x15, [sp, #64] +; CHECK-NEXT: .seh_save_any_reg_p x14, 64 +; CHECK: ldp x12, x13, [sp, #48] +; CHECK-NEXT: .seh_save_any_reg_p x12, 48 +; CHECK: ldp x10, x11, [sp, #32 +; CHECK-NEXT: .seh_save_any_reg_p x10, 32 +; CHECK: ldr x9, [sp, #24] +; CHECK-NEXT: .seh_save_any_reg x9, 24 +; CHECK: ldr x30, [sp, #16] +; 
CHECK-NEXT: .seh_save_reg x30, 16 + ; CHECK: .seh_endepilogue + +; CHECK-UNWIND: Prologue [ +; CHECK-UNWIND: 0xe74e04 ; stp x14, x15, [sp, #64] +; CHECK-UNWIND: 0xe74c03 ; stp x12, x13, [sp, #48] +; CHECK-UNWIND: 0xe74a02 ; stp x10, x11, [sp, #32] +; CHECK-UNWIND: 0xe70903 ; str x9, [sp, #24] +; CHECK-UNWIND: 0xd2c2 ; str x30, [sp, #16] +; CHECK-UNWIND: 0x05 ; sub sp, #80 +; CHECK-UNWIND: 0xe4 ; end +; CHECK-UNWIND: ] From d6c750b36ac73029bce9f1de6c976eb787c55253 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Tue, 11 Nov 2025 13:50:27 -0800 Subject: [PATCH 50/64] PPC: Disable type checking in xfailed sincospi test (#167563) This hangs in expensive_checks --- llvm/test/CodeGen/PowerPC/llvm.sincospi.ppcfp128.ll | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/test/CodeGen/PowerPC/llvm.sincospi.ppcfp128.ll b/llvm/test/CodeGen/PowerPC/llvm.sincospi.ppcfp128.ll index bc656bb785e9e..4fbb6a07aa37d 100644 --- a/llvm/test/CodeGen/PowerPC/llvm.sincospi.ppcfp128.ll +++ b/llvm/test/CodeGen/PowerPC/llvm.sincospi.ppcfp128.ll @@ -1,6 +1,6 @@ ; XFAIL: * ; FIXME: asserts -; RUN: llc -mcpu=pwr9 -mtriple=powerpc64le-gnu-linux -filetype=null \ +; RUN: llc -mcpu=pwr9 -mtriple=powerpc64le-gnu-linux -filetype=null -enable-legalize-types-checking=0 \ ; RUN: -ppc-vsr-nums-as-vr -ppc-asm-full-reg-names %s define { ppc_fp128, ppc_fp128 } @test_sincospi_ppcf128(ppc_fp128 %a) { From bbde792786dc93fc07cf245dd118f9d8b018de11 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Tue, 11 Nov 2025 13:50:57 -0800 Subject: [PATCH 51/64] AMDGPU: Relax shouldCoalesce to allow more register tuple widening (#166475) Allow widening up to 128-bit registers or if the new register class is at least as large as one of the existing register classes. This was artificially limiting. In particular this was doing the wrong thing with sequences involving copies between VGPRs and AV registers. Nearly all test changes are improvements. The coalescer does not just widen registers out of nowhere. If it's trying to "widen" a register, it's generally packing a register into an existing register tuple, or in a situation where the constraints imply the wider class anyway. 067a11015 addressed the allocation failure concern by rejecting coalescing if there are no available registers. The original change in a4e63ead4b didn't include a realistic testcase to judge if this is harmful for pressure. I would expect any issues from this to be of garden variety subreg handling issue. We could use more dynamic state information here if it really is an issue. I get the best results by removing this override completely. This is a smaller step for patch splitting purposes. 
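The relaxed predicate itself is small; a minimal sketch (sizes in bits;
shouldCoalesceSketch is an illustrative stand-in for the new
SIRegisterInfo::shouldCoalesce body, with the MI and register-class
plumbing stripped out):

  #include <cassert>

  bool shouldCoalesceSketch(unsigned NewSize, unsigned SrcSize,
                            unsigned DstSize) {
    // Old rule: beyond 32-bit sources/destinations, only allow
    // NewSize <= max(SrcSize, DstSize). New rule: additionally allow any
    // result tuple of up to 128 bits.
    return NewSize <= 128 || NewSize <= SrcSize || NewSize <= DstSize;
  }

  int main() {
    // Packing a 64-bit register into a 128-bit tuple is now permitted...
    assert(shouldCoalesceSketch(128, 64, 64));
    // ...but very wide tuples still require one side to already be wide.
    assert(!shouldCoalesceSketch(256, 64, 128));
    assert(shouldCoalesceSketch(256, 256, 64));
    return 0;
  }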
--- llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp | 15 +- .../AMDGPU/GlobalISel/mul-known-bits.i64.ll | 185 +- llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll | 930 +++--- .../CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll | 2395 +++++++------- .../test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll | 935 +++--- .../CodeGen/AMDGPU/GlobalISel/srem.i64.ll | 2822 ++++++++--------- .../test/CodeGen/AMDGPU/GlobalISel/udivrem.ll | 668 ++-- .../atomic_optimizations_global_pointer.ll | 21 +- .../atomic_optimizations_local_pointer.ll | 27 +- ...ffer-fat-pointers-contents-legalization.ll | 6 +- llvm/test/CodeGen/AMDGPU/bypass-div.ll | 166 +- llvm/test/CodeGen/AMDGPU/dagcomb-mullohi.ll | 10 +- .../CodeGen/AMDGPU/div-rem-by-constant-64.ll | 60 +- llvm/test/CodeGen/AMDGPU/div_v2i128.ll | 265 +- llvm/test/CodeGen/AMDGPU/fptoi.i128.ll | 666 ++-- .../hazard-recognizer-src-shared-base.ll | 6 +- .../identical-subrange-spill-infloop.ll | 12 +- .../CodeGen/AMDGPU/integer-mad-patterns.ll | 559 ++-- .../test/CodeGen/AMDGPU/lds-misaligned-bug.ll | 197 +- llvm/test/CodeGen/AMDGPU/limit-coalesce.mir | 56 +- llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll | 20 +- llvm/test/CodeGen/AMDGPU/load-global-i16.ll | 33 +- llvm/test/CodeGen/AMDGPU/load-local-i16.ll | 115 +- llvm/test/CodeGen/AMDGPU/mad_64_32.ll | 72 +- llvm/test/CodeGen/AMDGPU/mul.ll | 169 +- llvm/test/CodeGen/AMDGPU/rem_i128.ll | 66 +- .../AMDGPU/shufflevector.v2f32.v2f32.ll | 64 +- .../AMDGPU/shufflevector.v2f32.v3f32.ll | 520 +-- .../AMDGPU/shufflevector.v2i32.v2i32.ll | 64 +- .../AMDGPU/shufflevector.v2i32.v3i32.ll | 520 +-- .../CodeGen/AMDGPU/shufflevector.v2p3.v2p3.ll | 64 +- .../CodeGen/AMDGPU/shufflevector.v2p3.v3p3.ll | 520 +-- .../AMDGPU/shufflevector.v3f32.v2f32.ll | 888 ++---- .../AMDGPU/shufflevector.v3f32.v3f32.ll | 420 ++- .../AMDGPU/shufflevector.v3i32.v2i32.ll | 888 ++---- .../AMDGPU/shufflevector.v3i32.v3i32.ll | 420 ++- .../CodeGen/AMDGPU/shufflevector.v3p3.v2p3.ll | 888 ++---- .../CodeGen/AMDGPU/shufflevector.v3p3.v3p3.ll | 420 ++- .../AMDGPU/shufflevector.v4bf16.v3bf16.ll | 124 +- .../AMDGPU/shufflevector.v4bf16.v4bf16.ll | 87 +- .../AMDGPU/shufflevector.v4f16.v3f16.ll | 124 +- .../AMDGPU/shufflevector.v4f16.v4f16.ll | 87 +- .../AMDGPU/shufflevector.v4f32.v3f32.ll | 65 +- .../AMDGPU/shufflevector.v4i16.v3i16.ll | 124 +- .../AMDGPU/shufflevector.v4i16.v4i16.ll | 87 +- .../AMDGPU/shufflevector.v4i32.v3i32.ll | 65 +- .../CodeGen/AMDGPU/shufflevector.v4p3.v3p3.ll | 65 +- llvm/test/CodeGen/AMDGPU/vector-reduce-mul.ll | 697 ++-- 48 files changed, 8259 insertions(+), 9418 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp index 7559efffffa54..ff3491d193460 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -3741,18 +3741,11 @@ bool SIRegisterInfo::shouldCoalesce(MachineInstr *MI, unsigned DstSubReg, const TargetRegisterClass *NewRC, LiveIntervals &LIS) const { - unsigned SrcSize = getRegSizeInBits(*SrcRC); - unsigned DstSize = getRegSizeInBits(*DstRC); + // TODO: This should be more aggressive, but be more cautious with very wide + // tuples. unsigned NewSize = getRegSizeInBits(*NewRC); - - // Do not increase size of registers beyond dword, we would need to allocate - // adjacent registers and constraint regalloc more than needed. - - // Always allow dword coalescing. 
- if (SrcSize <= 32 || DstSize <= 32) - return true; - - return NewSize <= DstSize || NewSize <= SrcSize; + return NewSize <= 128 || NewSize <= getRegSizeInBits(*SrcRC) || + NewSize <= getRegSizeInBits(*DstRC); } unsigned SIRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC, diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll index 2351c969d5e49..9a90faf723461 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll @@ -8,17 +8,16 @@ define amdgpu_kernel void @v_mul_i64_no_zext(ptr addrspace(1) %out, ptr addrspac ; GFX10-LABEL: v_mul_i64_no_zext: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c -; GFX10-NEXT: v_lshlrev_b32_e32 v7, 3, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: global_load_dwordx2 v[0:1], v7, s[0:1] -; GFX10-NEXT: global_load_dwordx2 v[2:3], v7, s[2:3] +; GFX10-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1] +; GFX10-NEXT: global_load_dwordx2 v[4:5], v6, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mad_u64_u32 v[4:5], s0, v0, v2, 0 -; GFX10-NEXT: v_mad_u64_u32 v[5:6], s0, v0, v3, v[5:6] -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s0, v1, v2, v[5:6] -; GFX10-NEXT: v_mov_b32_e32 v5, v0 -; GFX10-NEXT: global_store_dwordx2 v7, v[4:5], s[2:3] +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s0, v2, v4, 0 +; GFX10-NEXT: v_mad_u64_u32 v[1:2], s0, v2, v5, v[1:2] +; GFX10-NEXT: v_mad_u64_u32 v[1:2], s0, v3, v4, v[1:2] +; GFX10-NEXT: global_store_dwordx2 v6, v[0:1], s[2:3] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_mul_i64_no_zext: @@ -26,19 +25,17 @@ define amdgpu_kernel void @v_mul_i64_no_zext(ptr addrspace(1) %out, ptr addrspac ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v9, 3, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v8, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: global_load_b64 v[0:1], v9, s[0:1] -; GFX11-NEXT: global_load_b64 v[2:3], v9, s[2:3] +; GFX11-NEXT: global_load_b64 v[2:3], v8, s[0:1] +; GFX11-NEXT: global_load_b64 v[4:5], v8, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mad_u64_u32 v[4:5], null, v0, v2, 0 +; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v2, v4, 0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_mad_u64_u32 v[6:7], null, v0, v3, v[5:6] -; GFX11-NEXT: v_mad_u64_u32 v[7:8], null, v1, v2, v[6:7] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_mov_b32_e32 v5, v7 -; GFX11-NEXT: global_store_b64 v9, v[4:5], s[2:3] +; GFX11-NEXT: v_mad_u64_u32 v[6:7], null, v2, v5, v[1:2] +; GFX11-NEXT: v_mad_u64_u32 v[1:2], null, v3, v4, v[6:7] +; GFX11-NEXT: global_store_b64 v8, v[0:1], s[2:3] ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.a = getelementptr inbounds i64, ptr addrspace(1) %aptr, i32 %tid @@ -58,18 +55,16 @@ define amdgpu_kernel void @v_mul_i64_zext_src1(ptr addrspace(1) %out, ptr addrsp ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 3, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 2, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 3, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx2 
v[0:1], v2, s[2:3] -; GFX10-NEXT: global_load_dword v4, v3, s[6:7] +; GFX10-NEXT: global_load_dwordx2 v[2:3], v1, s[2:3] +; GFX10-NEXT: global_load_dword v4, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mad_u64_u32 v[2:3], s2, v0, v4, 0 -; GFX10-NEXT: v_mov_b32_e32 v0, v3 -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s2, v1, v4, v[0:1] -; GFX10-NEXT: v_mov_b32_e32 v3, v0 -; GFX10-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-NEXT: global_store_dwordx2 v0, v[2:3], s[0:1] +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s2, v2, v4, 0 +; GFX10-NEXT: v_mad_u64_u32 v[1:2], s2, v3, v4, v[1:2] +; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_mul_i64_zext_src1: @@ -80,17 +75,17 @@ define amdgpu_kernel void @v_mul_i64_zext_src1(ptr addrspace(1) %out, ptr addrsp ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 3, v0 -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[0:1], v1, s[2:3] -; GFX11-NEXT: global_load_b32 v5, v2, s[4:5] +; GFX11-NEXT: global_load_b64 v[2:3], v1, s[2:3] +; GFX11-NEXT: global_load_b32 v5, v0, s[4:5] ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, v0, v5, 0 +; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v2, v5, 0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_mov_b32_e32 v0, v3 -; GFX11-NEXT: v_mad_u64_u32 v[3:4], null, v1, v5, v[0:1] -; GFX11-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-NEXT: global_store_b64 v0, v[2:3], s[0:1] +; GFX11-NEXT: v_mov_b32_e32 v4, v1 +; GFX11-NEXT: v_mad_u64_u32 v[1:2], null, v3, v5, v[4:5] +; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.a = getelementptr inbounds i64, ptr addrspace(1) %aptr, i32 %tid @@ -110,18 +105,16 @@ define amdgpu_kernel void @v_mul_i64_zext_src0(ptr addrspace(1) %out, ptr addrsp ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 2, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v4, v2, s[2:3] -; GFX10-NEXT: global_load_dwordx2 v[0:1], v3, s[6:7] +; GFX10-NEXT: global_load_dword v4, v1, s[2:3] +; GFX10-NEXT: global_load_dwordx2 v[2:3], v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mad_u64_u32 v[2:3], s2, v4, v0, 0 -; GFX10-NEXT: v_mov_b32_e32 v0, v3 -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s2, v4, v1, v[0:1] -; GFX10-NEXT: v_mov_b32_e32 v3, v0 -; GFX10-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-NEXT: global_store_dwordx2 v0, v[2:3], s[0:1] +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s2, v4, v2, 0 +; GFX10-NEXT: v_mad_u64_u32 v[1:2], s2, v4, v3, v[1:2] +; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_mul_i64_zext_src0: @@ -135,14 +128,14 @@ define amdgpu_kernel void @v_mul_i64_zext_src0(ptr addrspace(1) %out, ptr addrsp ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v5, v1, s[2:3] -; GFX11-NEXT: global_load_b64 v[0:1], v0, s[4:5] +; GFX11-NEXT: global_load_b64 v[2:3], v0, s[4:5] ; GFX11-NEXT: s_waitcnt vmcnt(0) 
-; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, v5, v0, 0 +; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v5, v2, 0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_mov_b32_e32 v0, v3 -; GFX11-NEXT: v_mad_u64_u32 v[3:4], null, v5, v1, v[0:1] -; GFX11-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-NEXT: global_store_b64 v0, v[2:3], s[0:1] +; GFX11-NEXT: v_mov_b32_e32 v4, v1 +; GFX11-NEXT: v_mad_u64_u32 v[1:2], null, v5, v3, v[4:5] +; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.a = getelementptr inbounds i32, ptr addrspace(1) %aptr, i32 %tid @@ -209,18 +202,16 @@ define amdgpu_kernel void @v_mul_i64_masked_src0_hi(ptr addrspace(1) %out, ptr a ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: global_load_dword v4, v2, s[2:3] -; GFX10-NEXT: global_load_dwordx2 v[0:1], v2, s[6:7] +; GFX10-NEXT: global_load_dword v4, v0, s[2:3] +; GFX10-NEXT: global_load_dwordx2 v[2:3], v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mad_u64_u32 v[2:3], s2, v4, v0, 0 -; GFX10-NEXT: v_mov_b32_e32 v0, v3 -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s2, v4, v1, v[0:1] -; GFX10-NEXT: v_mov_b32_e32 v3, v0 -; GFX10-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-NEXT: global_store_dwordx2 v0, v[2:3], s[0:1] +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s2, v4, v2, 0 +; GFX10-NEXT: v_mad_u64_u32 v[1:2], s2, v4, v3, v[1:2] +; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_mul_i64_masked_src0_hi: @@ -234,14 +225,14 @@ define amdgpu_kernel void @v_mul_i64_masked_src0_hi(ptr addrspace(1) %out, ptr a ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_load_b32 v5, v0, s[2:3] -; GFX11-NEXT: global_load_b64 v[0:1], v0, s[4:5] +; GFX11-NEXT: global_load_b64 v[2:3], v0, s[4:5] ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, v5, v0, 0 +; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v5, v2, 0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_mov_b32_e32 v0, v3 -; GFX11-NEXT: v_mad_u64_u32 v[3:4], null, v5, v1, v[0:1] -; GFX11-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-NEXT: global_store_b64 v0, v[2:3], s[0:1] +; GFX11-NEXT: v_mov_b32_e32 v4, v1 +; GFX11-NEXT: v_mad_u64_u32 v[1:2], null, v5, v3, v[4:5] +; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.a = getelementptr inbounds i64, ptr addrspace(1) %aptr, i32 %tid @@ -389,22 +380,20 @@ define amdgpu_kernel void @v_mul_i64_partially_masked_src0(ptr addrspace(1) %out ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[2:3] -; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[6:7] +; GFX10-NEXT: global_load_dwordx2 v[1:2], v0, s[2:3] +; GFX10-NEXT: global_load_dwordx2 v[3:4], v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(1) -; GFX10-NEXT: v_and_b32_e32 v6, 0xfff00000, v0 +; 
GFX10-NEXT: v_and_b32_e32 v5, 0xfff00000, v1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mad_u64_u32 v[4:5], s2, v6, v2, 0 -; GFX10-NEXT: v_mov_b32_e32 v0, v5 -; GFX10-NEXT: v_mad_u64_u32 v[5:6], s2, v6, v3, v[0:1] -; GFX10-NEXT: v_and_b32_e32 v0, 0xf00f, v1 -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s2, v0, v2, v[5:6] -; GFX10-NEXT: v_mov_b32_e32 v5, v0 -; GFX10-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-NEXT: global_store_dwordx2 v0, v[4:5], s[0:1] +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s2, v5, v3, 0 +; GFX10-NEXT: v_mad_u64_u32 v[4:5], s2, v5, v4, v[1:2] +; GFX10-NEXT: v_and_b32_e32 v1, 0xf00f, v2 +; GFX10-NEXT: v_mad_u64_u32 v[1:2], s2, v1, v3, v[4:5] +; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_mul_i64_partially_masked_src0: @@ -414,24 +403,22 @@ define amdgpu_kernel void @v_mul_i64_partially_masked_src0(ptr addrspace(1) %out ; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: global_load_b64 v[0:1], v2, s[2:3] -; GFX11-NEXT: global_load_b64 v[2:3], v2, s[4:5] +; GFX11-NEXT: global_load_b64 v[1:2], v0, s[2:3] +; GFX11-NEXT: global_load_b64 v[3:4], v0, s[4:5] ; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: v_and_b32_e32 v7, 0xfff00000, v0 +; GFX11-NEXT: v_and_b32_e32 v7, 0xfff00000, v1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_mad_u64_u32 v[4:5], null, v7, v2, 0 -; GFX11-NEXT: v_mov_b32_e32 v0, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_mad_u64_u32 v[5:6], null, v7, v3, v[0:1] -; GFX11-NEXT: v_and_b32_e32 v3, 0xf00f, v1 -; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v3, v2, v[5:6] +; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v7, v3, 0 +; GFX11-NEXT: v_mad_u64_u32 v[5:6], null, v7, v4, v[1:2] +; GFX11-NEXT: v_and_b32_e32 v4, 0xf00f, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v5, v0 :: v_dual_mov_b32 v0, 0 -; GFX11-NEXT: global_store_b64 v0, v[4:5], s[0:1] +; GFX11-NEXT: v_mad_u64_u32 v[1:2], null, v4, v3, v[5:6] +; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.a = getelementptr inbounds i64, ptr addrspace(1) %aptr, i32 %tid @@ -536,28 +523,28 @@ define amdgpu_kernel void @v_mul64_masked_before_and_in_branch(ptr addrspace(1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: global_load_b64 v[2:3], v0, s[2:3] -; GFX11-NEXT: global_load_b64 v[4:5], v0, s[4:5] +; GFX11-NEXT: global_load_b64 v[3:4], v0, s[2:3] +; GFX11-NEXT: global_load_b64 v[5:6], v0, s[4:5] ; GFX11-NEXT: s_mov_b32 s2, exec_lo ; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: v_cmpx_ge_u64_e32 0, v[2:3] +; GFX11-NEXT: v_cmpx_ge_u64_e32 0, v[3:4] ; GFX11-NEXT: s_xor_b32 s2, exec_lo, s2 ; GFX11-NEXT: s_cbranch_execz .LBB10_2 ; GFX11-NEXT: ; %bb.1: ; %else ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v2, v4, 0 +; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v3, v5, 0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_mad_u64_u32 
v[3:4], null, v2, v5, v[1:2] -; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5 -; GFX11-NEXT: v_mov_b32_e32 v1, v3 -; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX11-NEXT: v_mov_b32_e32 v4, v1 +; GFX11-NEXT: v_mad_u64_u32 v[1:2], null, v3, v6, v[4:5] +; GFX11-NEXT: ; implicit-def: $vgpr3_vgpr4 +; GFX11-NEXT: ; implicit-def: $vgpr5_vgpr6 ; GFX11-NEXT: .LBB10_2: ; %Flow ; GFX11-NEXT: s_and_not1_saveexec_b32 s2, s2 ; GFX11-NEXT: s_cbranch_execz .LBB10_4 ; GFX11-NEXT: ; %bb.3: ; %if ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mul_lo_u32 v1, v2, v5 +; GFX11-NEXT: v_mul_lo_u32 v1, v3, v6 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: .LBB10_4: ; %endif ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s2 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll index 7f10ee4c17450..3eecaccf0308f 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll @@ -741,12 +741,13 @@ define i96 @v_mul_i96(i96 %num, i96 %den) { ; GCN-NEXT: v_mov_b32_e32 v6, v0 ; GCN-NEXT: v_mov_b32_e32 v7, v1 ; GCN-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v5, 0 -; GCN-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v7, v4, v[0:1] -; GCN-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v3, 0 -; GCN-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v2, v3, v[8:9] -; GCN-NEXT: v_mov_b32_e32 v2, v10 -; GCN-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v6, v4, v[1:2] -; GCN-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v7, v3, v[8:9] +; GCN-NEXT: v_mov_b32_e32 v8, v2 +; GCN-NEXT: v_mov_b32_e32 v9, v3 +; GCN-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v7, v4, v[0:1] +; GCN-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v9, 0 +; GCN-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v8, v9, v[10:11] +; GCN-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v6, v4, v[1:2] +; GCN-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v7, v9, v[10:11] ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_mul_i96: @@ -754,26 +755,26 @@ define i96 @v_mul_i96(i96 %num, i96 %den) { ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v0 ; GFX10-NEXT: v_mov_b32_e32 v7, v1 +; GFX10-NEXT: v_mov_b32_e32 v8, v3 ; GFX10-NEXT: v_mul_lo_u32 v0, v6, v5 -; GFX10-NEXT: v_mad_u64_u32 v[8:9], s4, v7, v4, v[0:1] -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v6, v3, 0 -; GFX10-NEXT: v_mad_u64_u32 v[8:9], s4, v2, v3, v[8:9] -; GFX10-NEXT: v_mov_b32_e32 v2, v8 -; GFX10-NEXT: v_mad_u64_u32 v[8:9], s4, v6, v4, v[1:2] -; GFX10-NEXT: v_mad_u64_u32 v[1:2], s4, v7, v3, v[8:9] +; GFX10-NEXT: v_mad_u64_u32 v[9:10], s4, v7, v4, v[0:1] +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v6, v8, 0 +; GFX10-NEXT: v_mad_u64_u32 v[2:3], s4, v2, v8, v[9:10] +; GFX10-NEXT: v_mad_u64_u32 v[9:10], s4, v6, v4, v[1:2] +; GFX10-NEXT: v_mad_u64_u32 v[1:2], s4, v7, v8, v[9:10] ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_mul_i96: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v6, v0 :: v_dual_mov_b32 v7, v1 +; GFX11-NEXT: v_dual_mov_b32 v8, v2 :: v_dual_mov_b32 v9, v3 ; GFX11-NEXT: v_mul_lo_u32 v0, v6, v5 -; GFX11-NEXT: v_mad_u64_u32 v[8:9], null, v7, v4, v[0:1] -; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v6, v3, 0 -; GFX11-NEXT: v_mad_u64_u32 v[9:10], null, v2, v3, v[8:9] -; GFX11-NEXT: v_mov_b32_e32 v2, v9 -; GFX11-NEXT: v_mad_u64_u32 v[8:9], null, v6, v4, v[1:2] -; GFX11-NEXT: v_mad_u64_u32 v[1:2], null, v7, v3, v[8:9] +; GFX11-NEXT: v_mad_u64_u32 v[10:11], null, v7, v4, v[0:1] +; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v6, v9, 0 +; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, v8, v9, v[10:11] +; GFX11-NEXT: v_mad_u64_u32 v[10:11], 
null, v6, v4, v[1:2] +; GFX11-NEXT: v_mad_u64_u32 v[1:2], null, v7, v9, v[10:11] ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: v_mul_i96: @@ -784,16 +785,16 @@ define i96 @v_mul_i96(i96 %num, i96 %den) { ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v6, v0 :: v_dual_mov_b32 v7, v1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_mul_lo_u32 v0, v6, v5 -; GFX12-NEXT: v_mad_co_u64_u32 v[8:9], null, v7, v4, v[0:1] -; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v6, v3, 0 +; GFX12-NEXT: v_mov_b32_e32 v8, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_mad_co_u64_u32 v[8:9], null, v2, v3, v[8:9] -; GFX12-NEXT: v_mov_b32_e32 v2, v8 +; GFX12-NEXT: v_mul_lo_u32 v0, v6, v5 +; GFX12-NEXT: v_mad_co_u64_u32 v[9:10], null, v7, v4, v[0:1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v6, v8, 0 +; GFX12-NEXT: v_mad_co_u64_u32 v[2:3], null, v2, v8, v[9:10] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_mad_co_u64_u32 v[8:9], null, v6, v4, v[1:2] -; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], null, v7, v3, v[8:9] +; GFX12-NEXT: v_mad_co_u64_u32 v[9:10], null, v6, v4, v[1:2] +; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], null, v7, v8, v[9:10] ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX1250-LABEL: v_mul_i96: @@ -1072,18 +1073,17 @@ define i128 @v_mul_i128(i128 %num, i128 %den) { ; GFX7-NEXT: v_mov_b32_e32 v9, v1 ; GFX7-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v6, 0 ; GFX7-NEXT: v_mov_b32_e32 v10, v2 -; GFX7-NEXT: v_mov_b32_e32 v11, v3 -; GFX7-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v5, v[0:1] ; GFX7-NEXT: v_mov_b32_e32 v12, v4 +; GFX7-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v9, v5, v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v11, v3 ; GFX7-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v12, 0 -; GFX7-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v10, v12, v[2:3] -; GFX7-NEXT: v_mul_lo_u32 v7, v8, v7 -; GFX7-NEXT: v_mul_lo_u32 v6, v9, v6 -; GFX7-NEXT: v_mov_b32_e32 v2, v13 -; GFX7-NEXT: v_mad_u64_u32 v[3:4], vcc, v8, v5, v[1:2] -; GFX7-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v12, v[3:4] -; GFX7-NEXT: v_addc_u32_e64 v3, s[4:5], v14, v7, s[4:5] -; GFX7-NEXT: v_addc_u32_e32 v3, vcc, v3, v6, vcc +; GFX7-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v10, v12, v[13:14] +; GFX7-NEXT: v_mul_lo_u32 v4, v9, v6 +; GFX7-NEXT: v_mul_lo_u32 v6, v8, v7 +; GFX7-NEXT: v_mad_u64_u32 v[13:14], vcc, v8, v5, v[1:2] +; GFX7-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v12, v[13:14] +; GFX7-NEXT: v_addc_u32_e64 v3, s[4:5], v3, v6, s[4:5] +; GFX7-NEXT: v_addc_u32_e32 v3, vcc, v3, v4, vcc ; GFX7-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v10, v5, v[3:4] ; GFX7-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v11, v12, v[6:7] ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -1095,18 +1095,17 @@ define i128 @v_mul_i128(i128 %num, i128 %den) { ; GFX8-NEXT: v_mov_b32_e32 v9, v1 ; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v6, 0 ; GFX8-NEXT: v_mov_b32_e32 v10, v2 -; GFX8-NEXT: v_mov_b32_e32 v11, v3 -; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v5, v[0:1] ; GFX8-NEXT: v_mov_b32_e32 v12, v4 +; GFX8-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v9, v5, v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v11, v3 ; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v12, 0 -; GFX8-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v10, v12, v[2:3] -; GFX8-NEXT: v_mul_lo_u32 v7, v8, v7 -; GFX8-NEXT: v_mul_lo_u32 v6, v9, v6 -; GFX8-NEXT: v_mov_b32_e32 v2, v13 -; GFX8-NEXT: 
v_mad_u64_u32 v[3:4], vcc, v8, v5, v[1:2] -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v12, v[3:4] -; GFX8-NEXT: v_addc_u32_e64 v3, s[4:5], v14, v7, s[4:5] -; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v3, v6, vcc +; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v10, v12, v[13:14] +; GFX8-NEXT: v_mul_lo_u32 v4, v9, v6 +; GFX8-NEXT: v_mul_lo_u32 v6, v8, v7 +; GFX8-NEXT: v_mad_u64_u32 v[13:14], vcc, v8, v5, v[1:2] +; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v12, v[13:14] +; GFX8-NEXT: v_addc_u32_e64 v3, s[4:5], v3, v6, s[4:5] +; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v3, v4, vcc ; GFX8-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v10, v5, v[3:4] ; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v11, v12, v[6:7] ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -1118,18 +1117,17 @@ define i128 @v_mul_i128(i128 %num, i128 %den) { ; GFX9-NEXT: v_mov_b32_e32 v9, v1 ; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v6, 0 ; GFX9-NEXT: v_mov_b32_e32 v10, v2 -; GFX9-NEXT: v_mov_b32_e32 v11, v3 -; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v5, v[0:1] ; GFX9-NEXT: v_mov_b32_e32 v12, v4 +; GFX9-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v9, v5, v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v11, v3 ; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v12, 0 -; GFX9-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v10, v12, v[2:3] -; GFX9-NEXT: v_mul_lo_u32 v7, v8, v7 -; GFX9-NEXT: v_mul_lo_u32 v6, v9, v6 -; GFX9-NEXT: v_mov_b32_e32 v2, v13 -; GFX9-NEXT: v_mad_u64_u32 v[3:4], vcc, v8, v5, v[1:2] -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v12, v[3:4] -; GFX9-NEXT: v_addc_co_u32_e64 v3, s[4:5], v14, v7, s[4:5] -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v6, vcc +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v10, v12, v[13:14] +; GFX9-NEXT: v_mul_lo_u32 v4, v9, v6 +; GFX9-NEXT: v_mul_lo_u32 v6, v8, v7 +; GFX9-NEXT: v_mad_u64_u32 v[13:14], vcc, v8, v5, v[1:2] +; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v12, v[13:14] +; GFX9-NEXT: v_addc_co_u32_e64 v3, s[4:5], v3, v6, s[4:5] +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v4, vcc ; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v10, v5, v[3:4] ; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v11, v12, v[6:7] ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -1140,19 +1138,19 @@ define i128 @v_mul_i128(i128 %num, i128 %den) { ; GFX10-NEXT: v_mov_b32_e32 v8, v0 ; GFX10-NEXT: v_mov_b32_e32 v9, v1 ; GFX10-NEXT: v_mov_b32_e32 v10, v2 +; GFX10-NEXT: v_mov_b32_e32 v11, v3 ; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v8, v6, 0 ; GFX10-NEXT: v_mul_lo_u32 v7, v8, v7 ; GFX10-NEXT: v_mul_lo_u32 v6, v9, v6 -; GFX10-NEXT: v_mad_u64_u32 v[11:12], s4, v9, v5, v[0:1] +; GFX10-NEXT: v_mad_u64_u32 v[12:13], s4, v9, v5, v[0:1] ; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v8, v4, 0 -; GFX10-NEXT: v_mad_u64_u32 v[13:14], s4, v10, v4, v[11:12] -; GFX10-NEXT: v_mov_b32_e32 v2, v13 -; GFX10-NEXT: v_mad_u64_u32 v[11:12], vcc_lo, v8, v5, v[1:2] -; GFX10-NEXT: v_mad_u64_u32 v[1:2], s4, v9, v4, v[11:12] -; GFX10-NEXT: v_add_co_ci_u32_e64 v7, s4, v14, v7, s4 -; GFX10-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, v7, v6, vcc_lo -; GFX10-NEXT: v_mad_u64_u32 v[5:6], s4, v10, v5, v[6:7] -; GFX10-NEXT: v_mad_u64_u32 v[3:4], s4, v3, v4, v[5:6] +; GFX10-NEXT: v_mad_u64_u32 v[2:3], s4, v10, v4, v[12:13] +; GFX10-NEXT: v_mad_u64_u32 v[12:13], vcc_lo, v8, v5, v[1:2] +; GFX10-NEXT: v_mad_u64_u32 v[1:2], s4, v9, v4, v[12:13] +; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s4, v3, v7, s4 +; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo +; GFX10-NEXT: v_mad_u64_u32 v[5:6], s4, v10, v5, v[3:4] +; GFX10-NEXT: v_mad_u64_u32 v[3:4], s4, v11, v4, v[5:6] ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: 
v_mul_i128: @@ -1162,16 +1160,15 @@ define i128 @v_mul_i128(i128 %num, i128 %den) { ; GFX11-NEXT: v_dual_mov_b32 v10, v2 :: v_dual_mov_b32 v11, v4 ; GFX11-NEXT: v_mov_b32_e32 v12, v3 ; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v8, v6, 0 -; GFX11-NEXT: v_mul_lo_u32 v7, v8, v7 -; GFX11-NEXT: v_mul_lo_u32 v6, v9, v6 -; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, v9, v5, v[0:1] +; GFX11-NEXT: v_mul_lo_u32 v4, v9, v6 +; GFX11-NEXT: v_mul_lo_u32 v6, v8, v7 +; GFX11-NEXT: v_mad_u64_u32 v[13:14], null, v9, v5, v[0:1] ; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v8, v11, 0 -; GFX11-NEXT: v_mad_u64_u32 v[13:14], null, v10, v11, v[2:3] -; GFX11-NEXT: v_mov_b32_e32 v2, v13 -; GFX11-NEXT: v_mad_u64_u32 v[3:4], vcc_lo, v8, v5, v[1:2] -; GFX11-NEXT: v_mad_u64_u32 v[1:2], s0, v9, v11, v[3:4] -; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v14, v7, s0 -; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v3, v6, vcc_lo +; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, v10, v11, v[13:14] +; GFX11-NEXT: v_mad_u64_u32 v[13:14], vcc_lo, v8, v5, v[1:2] +; GFX11-NEXT: v_mad_u64_u32 v[1:2], s0, v9, v11, v[13:14] +; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v3, v6, s0 +; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v3, v4, vcc_lo ; GFX11-NEXT: v_mad_u64_u32 v[6:7], null, v10, v5, v[3:4] ; GFX11-NEXT: v_mad_u64_u32 v[3:4], null, v12, v11, v[6:7] ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -1184,28 +1181,26 @@ define i128 @v_mul_i128(i128 %num, i128 %den) { ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v8, v0 :: v_dual_mov_b32 v9, v1 -; GFX12-NEXT: v_mov_b32_e32 v10, v2 +; GFX12-NEXT: v_dual_mov_b32 v10, v2 :: v_dual_mov_b32 v11, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_4) ; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v8, v6, 0 ; GFX12-NEXT: v_mul_lo_u32 v7, v8, v7 ; GFX12-NEXT: v_mul_lo_u32 v6, v9, v6 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_mad_co_u64_u32 v[11:12], null, v9, v5, v[0:1] +; GFX12-NEXT: v_mad_co_u64_u32 v[12:13], null, v9, v5, v[0:1] ; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v8, v4, 0 -; GFX12-NEXT: v_mad_co_u64_u32 v[13:14], null, v10, v4, v[11:12] +; GFX12-NEXT: v_mad_co_u64_u32 v[2:3], null, v10, v4, v[12:13] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_mov_b32_e32 v2, v13 -; GFX12-NEXT: v_mad_co_u64_u32 v[11:12], vcc_lo, v8, v5, v[1:2] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], s0, v9, v4, v[11:12] +; GFX12-NEXT: v_mad_co_u64_u32 v[12:13], vcc_lo, v8, v5, v[1:2] +; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], s0, v9, v4, v[12:13] ; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: v_add_co_ci_u32_e64 v7, null, v14, v7, s0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_add_co_ci_u32_e64 v3, null, v3, v7, s0 ; GFX12-NEXT: s_wait_alu 0xfffd +; GFX12-NEXT: v_add_co_ci_u32_e64 v3, null, v3, v6, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add_co_ci_u32_e64 v6, null, v7, v6, vcc_lo -; GFX12-NEXT: v_mad_co_u64_u32 v[5:6], null, v10, v5, v[6:7] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_mad_co_u64_u32 v[3:4], null, v3, v4, v[5:6] +; GFX12-NEXT: v_mad_co_u64_u32 v[5:6], null, v10, v5, v[3:4] +; GFX12-NEXT: v_mad_co_u64_u32 v[3:4], null, v11, v4, v[5:6] ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX1250-LABEL: v_mul_i128: @@ 
-2409,216 +2404,204 @@ define i256 @v_mul_i256(i256 %num, i256 %den) { ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v0, v14, 0 -; GFX7-NEXT: v_mul_lo_u32 v28, v3, v12 -; GFX7-NEXT: v_mul_lo_u32 v27, v5, v10 +; GFX7-NEXT: v_mul_lo_u32 v29, v3, v12 +; GFX7-NEXT: v_mul_lo_u32 v30, v2, v13 ; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v1, v13, v[16:17] -; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v0, v12, 0 -; GFX7-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v2, v12, v[18:19] -; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v1, v11, v[16:17] -; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v3, v11, v[20:21] -; GFX7-NEXT: v_mad_u64_u32 v[20:21], s[6:7], v0, v10, 0 -; GFX7-NEXT: v_mad_u64_u32 v[22:23], s[6:7], v4, v10, v[16:17] -; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v1, v9, v[20:21] -; GFX7-NEXT: v_mad_u64_u32 v[20:21], s[8:9], v5, v9, v[22:23] -; GFX7-NEXT: v_mad_u64_u32 v[22:23], vcc, v2, v10, v[18:19] -; GFX7-NEXT: v_cndmask_b32_e64 v24, 0, 1, s[4:5] -; GFX7-NEXT: v_addc_u32_e32 v24, vcc, 0, v24, vcc -; GFX7-NEXT: v_mad_u64_u32 v[18:19], vcc, v3, v9, v[22:23] -; GFX7-NEXT: v_addc_u32_e32 v24, vcc, 0, v24, vcc -; GFX7-NEXT: v_mad_u64_u32 v[22:23], vcc, v4, v8, v[18:19] -; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v6, v8, v[20:21] -; GFX7-NEXT: v_addc_u32_e32 v26, vcc, 0, v24, vcc -; GFX7-NEXT: v_mov_b32_e32 v21, v22 -; GFX7-NEXT: v_mov_b32_e32 v22, v23 -; GFX7-NEXT: v_mov_b32_e32 v23, v18 -; GFX7-NEXT: v_mad_u64_u32 v[24:25], vcc, v0, v13, v[22:23] -; GFX7-NEXT: v_mul_lo_u32 v18, v6, v9 -; GFX7-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[6:7] -; GFX7-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v2, v8, v[16:17] -; GFX7-NEXT: v_addc_u32_e64 v6, s[4:5], 0, v6, s[4:5] -; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v1, v12, v[24:25] -; GFX7-NEXT: v_mov_b32_e32 v20, v23 -; GFX7-NEXT: v_mul_lo_u32 v25, v4, v11 -; GFX7-NEXT: v_mad_u64_u32 v[23:24], s[6:7], v2, v11, v[16:17] -; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[10:11], v0, v11, v[20:21] -; GFX7-NEXT: v_mad_u64_u32 v[11:12], s[8:9], v3, v10, v[23:24] -; GFX7-NEXT: v_cndmask_b32_e64 v23, 0, 1, s[10:11] -; GFX7-NEXT: v_mad_u64_u32 v[20:21], s[10:11], v1, v10, v[16:17] -; GFX7-NEXT: v_addc_u32_e64 v24, s[10:11], 0, v23, s[10:11] -; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[10:11], v2, v9, v[20:21] -; GFX7-NEXT: v_mad_u64_u32 v[20:21], s[12:13], v4, v9, v[11:12] -; GFX7-NEXT: v_mad_u64_u32 v[10:11], s[14:15], v0, v8, 0 -; GFX7-NEXT: v_mov_b32_e32 v12, v22 -; GFX7-NEXT: v_mul_lo_u32 v2, v2, v13 -; GFX7-NEXT: v_mad_u64_u32 v[22:23], s[16:17], v0, v9, v[11:12] -; GFX7-NEXT: v_addc_u32_e64 v13, s[10:11], 0, v24, s[10:11] -; GFX7-NEXT: v_mad_u64_u32 v[11:12], s[14:15], v5, v8, v[20:21] -; GFX7-NEXT: v_mad_u64_u32 v[4:5], s[10:11], v3, v8, v[16:17] -; GFX7-NEXT: v_addc_u32_e64 v16, s[10:11], 0, v13, s[10:11] -; GFX7-NEXT: v_mul_lo_u32 v9, v1, v14 -; GFX7-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[16:17] -; GFX7-NEXT: v_mad_u64_u32 v[13:14], s[10:11], v1, v8, v[22:23] -; GFX7-NEXT: v_addc_u32_e64 v3, s[10:11], v3, v4, s[10:11] +; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v2, v12, v[18:19] +; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v0, v10, 0 +; GFX7-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v3, v11, v[16:17] +; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v1, v9, v[18:19] +; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[6:7], v4, v10, v[20:21] +; GFX7-NEXT: v_mad_u64_u32 v[20:21], s[6:7], v0, v12, 0 +; GFX7-NEXT: v_mad_u64_u32 v[22:23], s[6:7], v5, v9, v[18:19] +; GFX7-NEXT: v_mad_u64_u32 v[24:25], s[6:7], v1, v11, v[20:21] +; 
GFX7-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[4:5] +; GFX7-NEXT: v_mad_u64_u32 v[18:19], vcc, v2, v8, v[16:17] +; GFX7-NEXT: v_addc_u32_e32 v28, vcc, 0, v20, vcc +; GFX7-NEXT: v_mad_u64_u32 v[16:17], vcc, v2, v10, v[24:25] +; GFX7-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[6:7] +; GFX7-NEXT: v_addc_u32_e32 v20, vcc, 0, v20, vcc +; GFX7-NEXT: v_mad_u64_u32 v[26:27], s[8:9], v6, v8, v[22:23] +; GFX7-NEXT: v_mad_u64_u32 v[22:23], vcc, v3, v9, v[16:17] +; GFX7-NEXT: v_addc_u32_e32 v16, vcc, 0, v20, vcc +; GFX7-NEXT: v_mad_u64_u32 v[20:21], vcc, v4, v8, v[22:23] +; GFX7-NEXT: v_mov_b32_e32 v22, v26 +; GFX7-NEXT: v_addc_u32_e32 v23, vcc, 0, v16, vcc +; GFX7-NEXT: v_mad_u64_u32 v[16:17], vcc, v0, v13, v[21:22] +; GFX7-NEXT: v_mad_u64_u32 v[21:22], s[12:13], v0, v11, v[19:20] +; GFX7-NEXT: v_mad_u64_u32 v[19:20], s[4:5], v1, v12, v[16:17] +; GFX7-NEXT: v_mul_lo_u32 v26, v4, v11 +; GFX7-NEXT: v_mul_lo_u32 v25, v5, v10 +; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v2, v11, v[19:20] +; GFX7-NEXT: v_mul_lo_u32 v24, v6, v9 +; GFX7-NEXT: v_mad_u64_u32 v[11:12], s[8:9], v3, v10, v[16:17] +; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[10:11], v4, v9, v[11:12] +; GFX7-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[12:13] +; GFX7-NEXT: v_mad_u64_u32 v[11:12], s[12:13], v1, v10, v[21:22] +; GFX7-NEXT: v_addc_u32_e64 v4, s[12:13], 0, v4, s[12:13] +; GFX7-NEXT: v_mad_u64_u32 v[19:20], s[12:13], v2, v9, v[11:12] +; GFX7-NEXT: v_mul_lo_u32 v10, v1, v14 +; GFX7-NEXT: v_mad_u64_u32 v[13:14], s[14:15], v5, v8, v[16:17] +; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[16:17], v0, v8, 0 +; GFX7-NEXT: v_addc_u32_e64 v2, s[12:13], 0, v4, s[12:13] +; GFX7-NEXT: v_mad_u64_u32 v[21:22], s[12:13], v3, v8, v[19:20] +; GFX7-NEXT: v_addc_u32_e64 v5, s[12:13], 0, v2, s[12:13] +; GFX7-NEXT: v_mad_u64_u32 v[2:3], s[12:13], v0, v9, v[17:18] +; GFX7-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[12:13] ; GFX7-NEXT: v_mul_lo_u32 v0, v0, v15 -; GFX7-NEXT: v_addc_u32_e64 v4, s[10:11], v6, v5, s[10:11] -; GFX7-NEXT: v_addc_u32_e64 v5, s[10:11], v16, v11, s[10:11] -; GFX7-NEXT: v_addc_u32_e64 v6, s[10:11], v26, v12, s[10:11] -; GFX7-NEXT: v_addc_u32_e64 v0, s[10:11], v19, v0, s[10:11] -; GFX7-NEXT: v_addc_u32_e64 v0, s[10:11], v0, v9, s[14:15] -; GFX7-NEXT: v_addc_u32_e64 v0, s[10:11], v0, v2, s[12:13] -; GFX7-NEXT: v_addc_u32_e64 v0, s[8:9], v0, v28, s[8:9] -; GFX7-NEXT: v_addc_u32_e64 v0, s[6:7], v0, v25, s[6:7] -; GFX7-NEXT: v_addc_u32_e64 v0, s[4:5], v0, v27, s[4:5] -; GFX7-NEXT: v_addc_u32_e32 v0, vcc, v0, v18, vcc -; GFX7-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v7, v8, v[0:1] -; GFX7-NEXT: v_mov_b32_e32 v0, v10 -; GFX7-NEXT: v_mov_b32_e32 v1, v13 -; GFX7-NEXT: v_mov_b32_e32 v2, v14 -; GFX7-NEXT: v_mov_b32_e32 v7, v11 +; GFX7-NEXT: v_mad_u64_u32 v[11:12], s[12:13], v1, v8, v[2:3] +; GFX7-NEXT: v_addc_u32_e64 v3, s[12:13], v4, v21, s[12:13] +; GFX7-NEXT: v_addc_u32_e64 v4, s[12:13], v28, v22, s[12:13] +; GFX7-NEXT: v_addc_u32_e64 v5, s[12:13], v5, v13, s[12:13] +; GFX7-NEXT: v_addc_u32_e64 v6, s[12:13], v23, v14, s[12:13] +; GFX7-NEXT: v_addc_u32_e64 v0, s[12:13], v27, v0, s[12:13] +; GFX7-NEXT: v_addc_u32_e64 v0, s[12:13], v0, v10, s[14:15] +; GFX7-NEXT: v_addc_u32_e64 v0, s[10:11], v0, v30, s[10:11] +; GFX7-NEXT: v_addc_u32_e64 v0, s[8:9], v0, v29, s[8:9] +; GFX7-NEXT: v_addc_u32_e64 v0, s[6:7], v0, v26, s[6:7] +; GFX7-NEXT: v_addc_u32_e64 v0, s[4:5], v0, v25, s[4:5] +; GFX7-NEXT: v_addc_u32_e32 v0, vcc, v0, v24, vcc +; GFX7-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v7, v8, v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v0, v16 +; GFX7-NEXT: v_mov_b32_e32 v1, v11 +; GFX7-NEXT: v_mov_b32_e32 v2, 
v12 +; GFX7-NEXT: v_mov_b32_e32 v7, v9 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_mul_i256: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v0, v14, 0 -; GFX8-NEXT: v_mul_lo_u32 v28, v3, v12 -; GFX8-NEXT: v_mul_lo_u32 v27, v5, v10 +; GFX8-NEXT: v_mul_lo_u32 v29, v3, v12 +; GFX8-NEXT: v_mul_lo_u32 v30, v2, v13 ; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v1, v13, v[16:17] -; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v0, v12, 0 -; GFX8-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v2, v12, v[18:19] -; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v1, v11, v[16:17] -; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v3, v11, v[20:21] -; GFX8-NEXT: v_mad_u64_u32 v[20:21], s[6:7], v0, v10, 0 -; GFX8-NEXT: v_mad_u64_u32 v[22:23], s[6:7], v4, v10, v[16:17] -; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v1, v9, v[20:21] -; GFX8-NEXT: v_mad_u64_u32 v[20:21], s[8:9], v5, v9, v[22:23] -; GFX8-NEXT: v_mad_u64_u32 v[22:23], vcc, v2, v10, v[18:19] -; GFX8-NEXT: v_cndmask_b32_e64 v24, 0, 1, s[4:5] -; GFX8-NEXT: v_addc_u32_e32 v24, vcc, 0, v24, vcc -; GFX8-NEXT: v_mad_u64_u32 v[18:19], vcc, v3, v9, v[22:23] -; GFX8-NEXT: v_addc_u32_e32 v24, vcc, 0, v24, vcc -; GFX8-NEXT: v_mad_u64_u32 v[22:23], vcc, v4, v8, v[18:19] -; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v6, v8, v[20:21] -; GFX8-NEXT: v_addc_u32_e32 v26, vcc, 0, v24, vcc -; GFX8-NEXT: v_mov_b32_e32 v21, v22 -; GFX8-NEXT: v_mov_b32_e32 v22, v23 -; GFX8-NEXT: v_mov_b32_e32 v23, v18 -; GFX8-NEXT: v_mad_u64_u32 v[24:25], vcc, v0, v13, v[22:23] -; GFX8-NEXT: v_mul_lo_u32 v18, v6, v9 -; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[6:7] -; GFX8-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v2, v8, v[16:17] -; GFX8-NEXT: v_addc_u32_e64 v6, s[4:5], 0, v6, s[4:5] -; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v1, v12, v[24:25] -; GFX8-NEXT: v_mov_b32_e32 v20, v23 -; GFX8-NEXT: v_mul_lo_u32 v25, v4, v11 -; GFX8-NEXT: v_mad_u64_u32 v[23:24], s[6:7], v2, v11, v[16:17] -; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[10:11], v0, v11, v[20:21] -; GFX8-NEXT: v_mad_u64_u32 v[11:12], s[8:9], v3, v10, v[23:24] -; GFX8-NEXT: v_cndmask_b32_e64 v23, 0, 1, s[10:11] -; GFX8-NEXT: v_mad_u64_u32 v[20:21], s[10:11], v1, v10, v[16:17] -; GFX8-NEXT: v_addc_u32_e64 v24, s[10:11], 0, v23, s[10:11] -; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[10:11], v2, v9, v[20:21] -; GFX8-NEXT: v_mad_u64_u32 v[20:21], s[12:13], v4, v9, v[11:12] -; GFX8-NEXT: v_mad_u64_u32 v[10:11], s[14:15], v0, v8, 0 -; GFX8-NEXT: v_mov_b32_e32 v12, v22 -; GFX8-NEXT: v_mul_lo_u32 v2, v2, v13 -; GFX8-NEXT: v_mad_u64_u32 v[22:23], s[16:17], v0, v9, v[11:12] -; GFX8-NEXT: v_addc_u32_e64 v13, s[10:11], 0, v24, s[10:11] -; GFX8-NEXT: v_mad_u64_u32 v[11:12], s[14:15], v5, v8, v[20:21] -; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[10:11], v3, v8, v[16:17] -; GFX8-NEXT: v_addc_u32_e64 v16, s[10:11], 0, v13, s[10:11] -; GFX8-NEXT: v_mul_lo_u32 v9, v1, v14 -; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[16:17] -; GFX8-NEXT: v_mad_u64_u32 v[13:14], s[10:11], v1, v8, v[22:23] -; GFX8-NEXT: v_addc_u32_e64 v3, s[10:11], v3, v4, s[10:11] +; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v2, v12, v[18:19] +; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v0, v10, 0 +; GFX8-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v3, v11, v[16:17] +; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v1, v9, v[18:19] +; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[6:7], v4, v10, v[20:21] +; GFX8-NEXT: v_mad_u64_u32 v[20:21], s[6:7], v0, v12, 0 +; GFX8-NEXT: v_mad_u64_u32 v[22:23], s[6:7], v5, v9, v[18:19] +; GFX8-NEXT: v_mad_u64_u32 v[24:25], 
s[6:7], v1, v11, v[20:21] +; GFX8-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[4:5] +; GFX8-NEXT: v_mad_u64_u32 v[18:19], vcc, v2, v8, v[16:17] +; GFX8-NEXT: v_addc_u32_e32 v28, vcc, 0, v20, vcc +; GFX8-NEXT: v_mad_u64_u32 v[16:17], vcc, v2, v10, v[24:25] +; GFX8-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[6:7] +; GFX8-NEXT: v_addc_u32_e32 v20, vcc, 0, v20, vcc +; GFX8-NEXT: v_mad_u64_u32 v[26:27], s[8:9], v6, v8, v[22:23] +; GFX8-NEXT: v_mad_u64_u32 v[22:23], vcc, v3, v9, v[16:17] +; GFX8-NEXT: v_addc_u32_e32 v16, vcc, 0, v20, vcc +; GFX8-NEXT: v_mad_u64_u32 v[20:21], vcc, v4, v8, v[22:23] +; GFX8-NEXT: v_mov_b32_e32 v22, v26 +; GFX8-NEXT: v_addc_u32_e32 v23, vcc, 0, v16, vcc +; GFX8-NEXT: v_mad_u64_u32 v[16:17], vcc, v0, v13, v[21:22] +; GFX8-NEXT: v_mad_u64_u32 v[21:22], s[12:13], v0, v11, v[19:20] +; GFX8-NEXT: v_mad_u64_u32 v[19:20], s[4:5], v1, v12, v[16:17] +; GFX8-NEXT: v_mul_lo_u32 v26, v4, v11 +; GFX8-NEXT: v_mul_lo_u32 v25, v5, v10 +; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v2, v11, v[19:20] +; GFX8-NEXT: v_mul_lo_u32 v24, v6, v9 +; GFX8-NEXT: v_mad_u64_u32 v[11:12], s[8:9], v3, v10, v[16:17] +; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[10:11], v4, v9, v[11:12] +; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[12:13] +; GFX8-NEXT: v_mad_u64_u32 v[11:12], s[12:13], v1, v10, v[21:22] +; GFX8-NEXT: v_addc_u32_e64 v4, s[12:13], 0, v4, s[12:13] +; GFX8-NEXT: v_mad_u64_u32 v[19:20], s[12:13], v2, v9, v[11:12] +; GFX8-NEXT: v_mul_lo_u32 v10, v1, v14 +; GFX8-NEXT: v_mad_u64_u32 v[13:14], s[14:15], v5, v8, v[16:17] +; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[16:17], v0, v8, 0 +; GFX8-NEXT: v_addc_u32_e64 v2, s[12:13], 0, v4, s[12:13] +; GFX8-NEXT: v_mad_u64_u32 v[21:22], s[12:13], v3, v8, v[19:20] +; GFX8-NEXT: v_addc_u32_e64 v5, s[12:13], 0, v2, s[12:13] +; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[12:13], v0, v9, v[17:18] +; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[12:13] ; GFX8-NEXT: v_mul_lo_u32 v0, v0, v15 -; GFX8-NEXT: v_addc_u32_e64 v4, s[10:11], v6, v5, s[10:11] -; GFX8-NEXT: v_addc_u32_e64 v5, s[10:11], v16, v11, s[10:11] -; GFX8-NEXT: v_addc_u32_e64 v6, s[10:11], v26, v12, s[10:11] -; GFX8-NEXT: v_addc_u32_e64 v0, s[10:11], v19, v0, s[10:11] -; GFX8-NEXT: v_addc_u32_e64 v0, s[10:11], v0, v9, s[14:15] -; GFX8-NEXT: v_addc_u32_e64 v0, s[10:11], v0, v2, s[12:13] -; GFX8-NEXT: v_addc_u32_e64 v0, s[8:9], v0, v28, s[8:9] -; GFX8-NEXT: v_addc_u32_e64 v0, s[6:7], v0, v25, s[6:7] -; GFX8-NEXT: v_addc_u32_e64 v0, s[4:5], v0, v27, s[4:5] -; GFX8-NEXT: v_addc_u32_e32 v0, vcc, v0, v18, vcc -; GFX8-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v7, v8, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v0, v10 -; GFX8-NEXT: v_mov_b32_e32 v1, v13 -; GFX8-NEXT: v_mov_b32_e32 v2, v14 -; GFX8-NEXT: v_mov_b32_e32 v7, v11 +; GFX8-NEXT: v_mad_u64_u32 v[11:12], s[12:13], v1, v8, v[2:3] +; GFX8-NEXT: v_addc_u32_e64 v3, s[12:13], v4, v21, s[12:13] +; GFX8-NEXT: v_addc_u32_e64 v4, s[12:13], v28, v22, s[12:13] +; GFX8-NEXT: v_addc_u32_e64 v5, s[12:13], v5, v13, s[12:13] +; GFX8-NEXT: v_addc_u32_e64 v6, s[12:13], v23, v14, s[12:13] +; GFX8-NEXT: v_addc_u32_e64 v0, s[12:13], v27, v0, s[12:13] +; GFX8-NEXT: v_addc_u32_e64 v0, s[12:13], v0, v10, s[14:15] +; GFX8-NEXT: v_addc_u32_e64 v0, s[10:11], v0, v30, s[10:11] +; GFX8-NEXT: v_addc_u32_e64 v0, s[8:9], v0, v29, s[8:9] +; GFX8-NEXT: v_addc_u32_e64 v0, s[6:7], v0, v26, s[6:7] +; GFX8-NEXT: v_addc_u32_e64 v0, s[4:5], v0, v25, s[4:5] +; GFX8-NEXT: v_addc_u32_e32 v0, vcc, v0, v24, vcc +; GFX8-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v7, v8, v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v0, v16 +; GFX8-NEXT: v_mov_b32_e32 v1, v11 +; 
GFX8-NEXT: v_mov_b32_e32 v2, v12 +; GFX8-NEXT: v_mov_b32_e32 v7, v9 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_mul_i256: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v0, v14, 0 -; GFX9-NEXT: v_mul_lo_u32 v28, v3, v12 -; GFX9-NEXT: v_mul_lo_u32 v27, v5, v10 +; GFX9-NEXT: v_mul_lo_u32 v29, v3, v12 +; GFX9-NEXT: v_mul_lo_u32 v30, v2, v13 ; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v1, v13, v[16:17] -; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v0, v12, 0 -; GFX9-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v2, v12, v[18:19] -; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v1, v11, v[16:17] -; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v3, v11, v[20:21] -; GFX9-NEXT: v_mad_u64_u32 v[20:21], s[6:7], v0, v10, 0 -; GFX9-NEXT: v_mad_u64_u32 v[22:23], s[6:7], v4, v10, v[16:17] -; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v1, v9, v[20:21] -; GFX9-NEXT: v_mad_u64_u32 v[20:21], s[8:9], v5, v9, v[22:23] -; GFX9-NEXT: v_mad_u64_u32 v[22:23], vcc, v2, v10, v[18:19] -; GFX9-NEXT: v_cndmask_b32_e64 v24, 0, 1, s[4:5] -; GFX9-NEXT: v_addc_co_u32_e32 v24, vcc, 0, v24, vcc -; GFX9-NEXT: v_mad_u64_u32 v[18:19], vcc, v3, v9, v[22:23] -; GFX9-NEXT: v_addc_co_u32_e32 v24, vcc, 0, v24, vcc -; GFX9-NEXT: v_mad_u64_u32 v[22:23], vcc, v4, v8, v[18:19] -; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v6, v8, v[20:21] -; GFX9-NEXT: v_addc_co_u32_e32 v26, vcc, 0, v24, vcc -; GFX9-NEXT: v_mov_b32_e32 v21, v22 -; GFX9-NEXT: v_mov_b32_e32 v22, v23 -; GFX9-NEXT: v_mov_b32_e32 v23, v18 -; GFX9-NEXT: v_mad_u64_u32 v[24:25], vcc, v0, v13, v[22:23] -; GFX9-NEXT: v_mul_lo_u32 v18, v6, v9 -; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[6:7] -; GFX9-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v2, v8, v[16:17] -; GFX9-NEXT: v_addc_co_u32_e64 v6, s[4:5], 0, v6, s[4:5] -; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v1, v12, v[24:25] -; GFX9-NEXT: v_mov_b32_e32 v20, v23 -; GFX9-NEXT: v_mul_lo_u32 v25, v4, v11 -; GFX9-NEXT: v_mad_u64_u32 v[23:24], s[6:7], v2, v11, v[16:17] -; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[10:11], v0, v11, v[20:21] -; GFX9-NEXT: v_mad_u64_u32 v[11:12], s[8:9], v3, v10, v[23:24] -; GFX9-NEXT: v_cndmask_b32_e64 v23, 0, 1, s[10:11] -; GFX9-NEXT: v_mad_u64_u32 v[20:21], s[10:11], v1, v10, v[16:17] -; GFX9-NEXT: v_addc_co_u32_e64 v24, s[10:11], 0, v23, s[10:11] -; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[10:11], v2, v9, v[20:21] -; GFX9-NEXT: v_mad_u64_u32 v[20:21], s[12:13], v4, v9, v[11:12] -; GFX9-NEXT: v_mad_u64_u32 v[10:11], s[14:15], v0, v8, 0 -; GFX9-NEXT: v_mov_b32_e32 v12, v22 -; GFX9-NEXT: v_mul_lo_u32 v2, v2, v13 -; GFX9-NEXT: v_mad_u64_u32 v[22:23], s[16:17], v0, v9, v[11:12] -; GFX9-NEXT: v_addc_co_u32_e64 v13, s[10:11], 0, v24, s[10:11] -; GFX9-NEXT: v_mad_u64_u32 v[11:12], s[14:15], v5, v8, v[20:21] -; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[10:11], v3, v8, v[16:17] -; GFX9-NEXT: v_addc_co_u32_e64 v16, s[10:11], 0, v13, s[10:11] -; GFX9-NEXT: v_mul_lo_u32 v9, v1, v14 -; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[16:17] -; GFX9-NEXT: v_mad_u64_u32 v[13:14], s[10:11], v1, v8, v[22:23] -; GFX9-NEXT: v_addc_co_u32_e64 v3, s[10:11], v3, v4, s[10:11] +; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v2, v12, v[18:19] +; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v0, v10, 0 +; GFX9-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v3, v11, v[16:17] +; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v1, v9, v[18:19] +; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[6:7], v4, v10, v[20:21] +; GFX9-NEXT: v_mad_u64_u32 v[20:21], s[6:7], v0, v12, 0 +; GFX9-NEXT: v_mad_u64_u32 v[22:23], s[6:7], v5, v9, 
v[18:19] +; GFX9-NEXT: v_mad_u64_u32 v[24:25], s[6:7], v1, v11, v[20:21] +; GFX9-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[4:5] +; GFX9-NEXT: v_mad_u64_u32 v[18:19], vcc, v2, v8, v[16:17] +; GFX9-NEXT: v_addc_co_u32_e32 v28, vcc, 0, v20, vcc +; GFX9-NEXT: v_mad_u64_u32 v[16:17], vcc, v2, v10, v[24:25] +; GFX9-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[6:7] +; GFX9-NEXT: v_addc_co_u32_e32 v20, vcc, 0, v20, vcc +; GFX9-NEXT: v_mad_u64_u32 v[26:27], s[8:9], v6, v8, v[22:23] +; GFX9-NEXT: v_mad_u64_u32 v[22:23], vcc, v3, v9, v[16:17] +; GFX9-NEXT: v_addc_co_u32_e32 v16, vcc, 0, v20, vcc +; GFX9-NEXT: v_mad_u64_u32 v[20:21], vcc, v4, v8, v[22:23] +; GFX9-NEXT: v_mov_b32_e32 v22, v26 +; GFX9-NEXT: v_addc_co_u32_e32 v23, vcc, 0, v16, vcc +; GFX9-NEXT: v_mad_u64_u32 v[16:17], vcc, v0, v13, v[21:22] +; GFX9-NEXT: v_mad_u64_u32 v[21:22], s[12:13], v0, v11, v[19:20] +; GFX9-NEXT: v_mad_u64_u32 v[19:20], s[4:5], v1, v12, v[16:17] +; GFX9-NEXT: v_mul_lo_u32 v26, v4, v11 +; GFX9-NEXT: v_mul_lo_u32 v25, v5, v10 +; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v2, v11, v[19:20] +; GFX9-NEXT: v_mul_lo_u32 v24, v6, v9 +; GFX9-NEXT: v_mad_u64_u32 v[11:12], s[8:9], v3, v10, v[16:17] +; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[10:11], v4, v9, v[11:12] +; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[12:13] +; GFX9-NEXT: v_mad_u64_u32 v[11:12], s[12:13], v1, v10, v[21:22] +; GFX9-NEXT: v_addc_co_u32_e64 v4, s[12:13], 0, v4, s[12:13] +; GFX9-NEXT: v_mad_u64_u32 v[19:20], s[12:13], v2, v9, v[11:12] +; GFX9-NEXT: v_mul_lo_u32 v10, v1, v14 +; GFX9-NEXT: v_mad_u64_u32 v[13:14], s[14:15], v5, v8, v[16:17] +; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[16:17], v0, v8, 0 +; GFX9-NEXT: v_addc_co_u32_e64 v2, s[12:13], 0, v4, s[12:13] +; GFX9-NEXT: v_mad_u64_u32 v[21:22], s[12:13], v3, v8, v[19:20] +; GFX9-NEXT: v_addc_co_u32_e64 v5, s[12:13], 0, v2, s[12:13] +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[12:13], v0, v9, v[17:18] +; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[12:13] ; GFX9-NEXT: v_mul_lo_u32 v0, v0, v15 -; GFX9-NEXT: v_addc_co_u32_e64 v4, s[10:11], v6, v5, s[10:11] -; GFX9-NEXT: v_addc_co_u32_e64 v5, s[10:11], v16, v11, s[10:11] -; GFX9-NEXT: v_addc_co_u32_e64 v6, s[10:11], v26, v12, s[10:11] -; GFX9-NEXT: v_addc_co_u32_e64 v0, s[10:11], v19, v0, s[10:11] -; GFX9-NEXT: v_addc_co_u32_e64 v0, s[10:11], v0, v9, s[14:15] -; GFX9-NEXT: v_addc_co_u32_e64 v0, s[10:11], v0, v2, s[12:13] -; GFX9-NEXT: v_addc_co_u32_e64 v0, s[8:9], v0, v28, s[8:9] -; GFX9-NEXT: v_addc_co_u32_e64 v0, s[6:7], v0, v25, s[6:7] -; GFX9-NEXT: v_addc_co_u32_e64 v0, s[4:5], v0, v27, s[4:5] -; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v0, v18, vcc -; GFX9-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v7, v8, v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v0, v10 -; GFX9-NEXT: v_mov_b32_e32 v1, v13 -; GFX9-NEXT: v_mov_b32_e32 v2, v14 -; GFX9-NEXT: v_mov_b32_e32 v7, v11 +; GFX9-NEXT: v_mad_u64_u32 v[11:12], s[12:13], v1, v8, v[2:3] +; GFX9-NEXT: v_addc_co_u32_e64 v3, s[12:13], v4, v21, s[12:13] +; GFX9-NEXT: v_addc_co_u32_e64 v4, s[12:13], v28, v22, s[12:13] +; GFX9-NEXT: v_addc_co_u32_e64 v5, s[12:13], v5, v13, s[12:13] +; GFX9-NEXT: v_addc_co_u32_e64 v6, s[12:13], v23, v14, s[12:13] +; GFX9-NEXT: v_addc_co_u32_e64 v0, s[12:13], v27, v0, s[12:13] +; GFX9-NEXT: v_addc_co_u32_e64 v0, s[12:13], v0, v10, s[14:15] +; GFX9-NEXT: v_addc_co_u32_e64 v0, s[10:11], v0, v30, s[10:11] +; GFX9-NEXT: v_addc_co_u32_e64 v0, s[8:9], v0, v29, s[8:9] +; GFX9-NEXT: v_addc_co_u32_e64 v0, s[6:7], v0, v26, s[6:7] +; GFX9-NEXT: v_addc_co_u32_e64 v0, s[4:5], v0, v25, s[4:5] +; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v0, v24, vcc +; 
GFX9-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v7, v8, v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v0, v16 +; GFX9-NEXT: v_mov_b32_e32 v1, v11 +; GFX9-NEXT: v_mov_b32_e32 v2, v12 +; GFX9-NEXT: v_mov_b32_e32 v7, v9 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_mul_i256: @@ -2626,69 +2609,67 @@ define i256 @v_mul_i256(i256 %num, i256 %den) { ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v16, v0 ; GFX10-NEXT: v_mov_b32_e32 v17, v1 -; GFX10-NEXT: v_mul_lo_u32 v29, v4, v11 -; GFX10-NEXT: v_mul_lo_u32 v31, v3, v12 -; GFX10-NEXT: v_mul_lo_u32 v30, v2, v13 +; GFX10-NEXT: v_mov_b32_e32 v18, v2 +; GFX10-NEXT: v_mov_b32_e32 v19, v3 +; GFX10-NEXT: v_mul_lo_u32 v27, v6, v9 ; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v16, v14, 0 -; GFX10-NEXT: v_mul_lo_u32 v28, v17, v14 -; GFX10-NEXT: v_mad_u64_u32 v[18:19], s4, v17, v13, v[0:1] -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v16, v12, 0 -; GFX10-NEXT: v_mad_u64_u32 v[20:21], s4, v2, v12, v[18:19] -; GFX10-NEXT: v_mad_u64_u32 v[18:19], s4, v17, v11, v[0:1] +; GFX10-NEXT: v_mul_lo_u32 v30, v4, v11 +; GFX10-NEXT: v_mul_lo_u32 v28, v5, v10 +; GFX10-NEXT: v_mad_u64_u32 v[2:3], s4, v17, v13, v[0:1] +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v18, v12, v[2:3] +; GFX10-NEXT: v_mad_u64_u32 v[2:3], s4, v16, v12, 0 +; GFX10-NEXT: v_mad_u64_u32 v[20:21], s4, v19, v11, v[0:1] +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v17, v11, v[2:3] ; GFX10-NEXT: v_cndmask_b32_e64 v22, 0, 1, s4 -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s5, v3, v11, v[20:21] -; GFX10-NEXT: v_mad_u64_u32 v[20:21], vcc_lo, v2, v10, v[18:19] +; GFX10-NEXT: v_mad_u64_u32 v[2:3], s5, v4, v10, v[20:21] +; GFX10-NEXT: v_mad_u64_u32 v[20:21], vcc_lo, v18, v10, v[0:1] ; GFX10-NEXT: v_add_co_ci_u32_e32 v22, vcc_lo, 0, v22, vcc_lo -; GFX10-NEXT: v_mad_u64_u32 v[18:19], s4, v4, v10, v[0:1] -; GFX10-NEXT: v_mad_u64_u32 v[0:1], vcc_lo, v3, v9, v[20:21] -; GFX10-NEXT: v_add_co_ci_u32_e32 v26, vcc_lo, 0, v22, vcc_lo -; GFX10-NEXT: v_mad_u64_u32 v[20:21], s4, v5, v9, v[18:19] -; GFX10-NEXT: v_mad_u64_u32 v[18:19], s4, v16, v10, 0 -; GFX10-NEXT: v_mad_u64_u32 v[22:23], vcc_lo, v4, v8, v[0:1] -; GFX10-NEXT: v_add_co_ci_u32_e32 v26, vcc_lo, 0, v26, vcc_lo -; GFX10-NEXT: v_mad_u64_u32 v[24:25], s4, v6, v8, v[20:21] -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v17, v9, v[18:19] -; GFX10-NEXT: v_mov_b32_e32 v18, v23 -; GFX10-NEXT: v_cndmask_b32_e64 v27, 0, 1, s4 -; GFX10-NEXT: v_mul_lo_u32 v23, v6, v9 -; GFX10-NEXT: v_mov_b32_e32 v19, v24 -; GFX10-NEXT: v_mul_lo_u32 v24, v5, v10 -; GFX10-NEXT: v_mad_u64_u32 v[20:21], vcc_lo, v2, v8, v[0:1] -; GFX10-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, 0, v27, vcc_lo -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v16, v13, v[18:19] -; GFX10-NEXT: v_mov_b32_e32 v19, v22 -; GFX10-NEXT: v_mul_lo_u32 v27, v16, v15 -; GFX10-NEXT: v_mov_b32_e32 v18, v21 -; GFX10-NEXT: v_mad_u64_u32 v[21:22], vcc_lo, v17, v12, v[0:1] -; GFX10-NEXT: v_mad_u64_u32 v[14:15], s6, v16, v11, v[18:19] +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v5, v9, v[2:3] +; GFX10-NEXT: v_mad_u64_u32 v[2:3], s4, v16, v10, 0 +; GFX10-NEXT: v_mad_u64_u32 v[23:24], vcc_lo, v19, v9, v[20:21] +; GFX10-NEXT: v_mad_u64_u32 v[25:26], s4, v6, v8, v[0:1] +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v17, v9, v[2:3] +; GFX10-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, 0, v22, vcc_lo +; GFX10-NEXT: v_mad_u64_u32 v[21:22], vcc_lo, v4, v8, v[23:24] +; GFX10-NEXT: v_mov_b32_e32 v23, v25 +; GFX10-NEXT: v_cndmask_b32_e64 v20, 0, 1, s4 +; GFX10-NEXT: v_mad_u64_u32 v[2:3], s4, v18, v8, v[0:1] +; GFX10-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, 0, v6, vcc_lo +; 
GFX10-NEXT: v_mad_u64_u32 v[0:1], vcc_lo, v16, v13, v[22:23] +; GFX10-NEXT: v_add_co_ci_u32_e64 v29, s4, 0, v20, s4 +; GFX10-NEXT: v_mov_b32_e32 v20, v3 +; GFX10-NEXT: v_mad_u64_u32 v[22:23], s4, v17, v12, v[0:1] +; GFX10-NEXT: v_mad_u64_u32 v[24:25], s6, v16, v11, v[20:21] ; GFX10-NEXT: v_mad_u64_u32 v[0:1], s5, v16, v8, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v32, 0, 1, s6 -; GFX10-NEXT: v_mad_u64_u32 v[18:19], s5, v2, v11, v[21:22] -; GFX10-NEXT: v_mad_u64_u32 v[11:12], s6, v17, v10, v[14:15] -; GFX10-NEXT: v_mov_b32_e32 v13, v1 -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s6, 0, v32, s6 -; GFX10-NEXT: v_mov_b32_e32 v14, v20 -; GFX10-NEXT: v_mad_u64_u32 v[21:22], s7, v3, v10, v[18:19] -; GFX10-NEXT: v_mad_u64_u32 v[18:19], s6, v2, v9, v[11:12] -; GFX10-NEXT: v_add_co_ci_u32_e64 v15, s6, 0, v1, s6 -; GFX10-NEXT: v_mad_u64_u32 v[10:11], s8, v16, v9, v[13:14] -; GFX10-NEXT: v_mad_u64_u32 v[1:2], s6, v4, v9, v[21:22] -; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s8 -; GFX10-NEXT: v_mad_u64_u32 v[12:13], s8, v3, v8, v[18:19] -; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s8, 0, v15, s8 -; GFX10-NEXT: v_mad_u64_u32 v[14:15], s8, v5, v8, v[1:2] -; GFX10-NEXT: v_mad_u64_u32 v[1:2], s9, v17, v8, v[10:11] -; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s9, v4, v12, s9 -; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s9, v6, v13, s9 -; GFX10-NEXT: v_add_co_ci_u32_e64 v5, s9, v9, v14, s9 -; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s9, v26, v15, s9 -; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s9, v25, v27, s9 -; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s8, v9, v28, s8 -; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s6, v9, v30, s6 -; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s6, v9, v31, s7 -; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s5, v9, v29, s5 -; GFX10-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v9, v24, vcc_lo -; GFX10-NEXT: v_add_co_ci_u32_e64 v9, vcc_lo, v9, v23, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s6 +; GFX10-NEXT: v_mad_u64_u32 v[20:21], s5, v18, v11, v[22:23] +; GFX10-NEXT: v_mul_lo_u32 v22, v16, v15 +; GFX10-NEXT: v_mul_lo_u32 v23, v17, v14 +; GFX10-NEXT: v_mad_u64_u32 v[14:15], s6, v17, v10, v[24:25] +; GFX10-NEXT: v_mul_lo_u32 v24, v19, v12 +; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s6, 0, v3, s6 +; GFX10-NEXT: v_mad_u64_u32 v[11:12], s7, v19, v10, v[20:21] +; GFX10-NEXT: v_mul_lo_u32 v25, v18, v13 +; GFX10-NEXT: v_mad_u64_u32 v[20:21], s6, v18, v9, v[14:15] +; GFX10-NEXT: v_add_co_ci_u32_e64 v15, s6, 0, v3, s6 +; GFX10-NEXT: v_mad_u64_u32 v[13:14], s6, v4, v9, v[11:12] +; GFX10-NEXT: v_mad_u64_u32 v[3:4], s8, v16, v9, v[1:2] +; GFX10-NEXT: v_cndmask_b32_e64 v16, 0, 1, s8 +; GFX10-NEXT: v_mad_u64_u32 v[9:10], s8, v19, v8, v[20:21] +; GFX10-NEXT: v_add_co_ci_u32_e64 v15, s8, 0, v15, s8 +; GFX10-NEXT: v_mad_u64_u32 v[11:12], s8, v5, v8, v[13:14] +; GFX10-NEXT: v_mad_u64_u32 v[1:2], s9, v17, v8, v[3:4] +; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s9, v16, v9, s9 +; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s9, v29, v10, s9 +; GFX10-NEXT: v_add_co_ci_u32_e64 v5, s9, v15, v11, s9 +; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s9, v6, v12, s9 +; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s9, v26, v22, s9 +; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s8, v9, v23, s8 +; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s6, v9, v25, s6 +; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s6, v9, v24, s7 +; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s5, v9, v30, s5 +; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s4, v9, v28, s4 +; GFX10-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v9, v27, vcc_lo ; GFX10-NEXT: v_mad_u64_u32 v[7:8], s4, v7, v8, v[9:10] ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -2696,69 +2677,68 @@ define i256 @v_mul_i256(i256 %num, i256 
%den) { ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v16, v0 :: v_dual_mov_b32 v17, v1 -; GFX11-NEXT: v_dual_mov_b32 v18, v8 :: v_dual_mov_b32 v19, v7 -; GFX11-NEXT: v_mul_lo_u32 v30, v4, v11 +; GFX11-NEXT: v_dual_mov_b32 v18, v2 :: v_dual_mov_b32 v19, v3 +; GFX11-NEXT: v_dual_mov_b32 v20, v8 :: v_dual_mov_b32 v21, v7 ; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v16, v14, 0 -; GFX11-NEXT: v_mul_lo_u32 v28, v16, v15 -; GFX11-NEXT: v_mul_lo_u32 v29, v17, v14 -; GFX11-NEXT: v_mul_lo_u32 v32, v3, v12 -; GFX11-NEXT: v_mul_lo_u32 v31, v2, v13 -; GFX11-NEXT: v_mad_u64_u32 v[7:8], null, v17, v13, v[0:1] -; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v16, v12, 0 -; GFX11-NEXT: v_mad_u64_u32 v[20:21], null, v2, v12, v[7:8] -; GFX11-NEXT: v_mad_u64_u32 v[7:8], s0, v17, v11, v[0:1] +; GFX11-NEXT: v_mul_lo_u32 v27, v6, v9 +; GFX11-NEXT: v_mul_lo_u32 v30, v4, v11 +; GFX11-NEXT: v_mul_lo_u32 v31, v17, v14 +; GFX11-NEXT: v_mul_lo_u32 v28, v5, v10 +; GFX11-NEXT: v_mul_lo_u32 v15, v16, v15 +; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, v17, v13, v[0:1] +; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v18, v12, v[2:3] +; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, v16, v12, 0 +; GFX11-NEXT: v_mad_u64_u32 v[7:8], null, v19, v11, v[0:1] +; GFX11-NEXT: v_mad_u64_u32 v[0:1], s0, v17, v11, v[2:3] ; GFX11-NEXT: v_cndmask_b32_e64 v22, 0, 1, s0 -; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v3, v11, v[20:21] -; GFX11-NEXT: v_mad_u64_u32 v[20:21], vcc_lo, v2, v10, v[7:8] +; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, v4, v10, v[7:8] +; GFX11-NEXT: v_mad_u64_u32 v[7:8], vcc_lo, v18, v10, v[0:1] ; GFX11-NEXT: v_add_co_ci_u32_e64 v22, null, 0, v22, vcc_lo -; GFX11-NEXT: v_mad_u64_u32 v[7:8], null, v4, v10, v[0:1] -; GFX11-NEXT: v_mad_u64_u32 v[0:1], vcc_lo, v3, v9, v[20:21] -; GFX11-NEXT: v_add_co_ci_u32_e64 v26, null, 0, v22, vcc_lo -; GFX11-NEXT: v_mad_u64_u32 v[20:21], null, v5, v9, v[7:8] -; GFX11-NEXT: v_mad_u64_u32 v[7:8], null, v16, v10, 0 -; GFX11-NEXT: v_mad_u64_u32 v[22:23], vcc_lo, v4, v18, v[0:1] -; GFX11-NEXT: v_add_co_ci_u32_e64 v26, null, 0, v26, vcc_lo -; GFX11-NEXT: v_mad_u64_u32 v[24:25], null, v6, v18, v[20:21] -; GFX11-NEXT: v_mad_u64_u32 v[0:1], s0, v17, v9, v[7:8] -; GFX11-NEXT: v_mov_b32_e32 v7, v23 -; GFX11-NEXT: v_cndmask_b32_e64 v27, 0, 1, s0 -; GFX11-NEXT: v_mul_lo_u32 v23, v6, v9 -; GFX11-NEXT: v_mov_b32_e32 v8, v24 -; GFX11-NEXT: v_mul_lo_u32 v24, v5, v10 -; GFX11-NEXT: v_mad_u64_u32 v[20:21], vcc_lo, v2, v18, v[0:1] -; GFX11-NEXT: v_add_co_ci_u32_e64 v27, null, 0, v27, vcc_lo -; GFX11-NEXT: v_mad_u64_u32 v[0:1], s0, v16, v13, v[7:8] -; GFX11-NEXT: v_dual_mov_b32 v7, v22 :: v_dual_mov_b32 v6, v21 -; GFX11-NEXT: v_mad_u64_u32 v[21:22], vcc_lo, v17, v12, v[0:1] -; GFX11-NEXT: v_mad_u64_u32 v[14:15], s2, v16, v11, v[6:7] -; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v16, v18, 0 -; GFX11-NEXT: v_cndmask_b32_e64 v8, 0, 1, s2 -; GFX11-NEXT: v_mad_u64_u32 v[6:7], s1, v2, v11, v[21:22] -; GFX11-NEXT: v_mad_u64_u32 v[11:12], s2, v17, v10, v[14:15] -; GFX11-NEXT: v_dual_mov_b32 v13, v1 :: v_dual_mov_b32 v14, v20 -; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v8, s2 -; GFX11-NEXT: v_mad_u64_u32 v[21:22], s3, v3, v10, v[6:7] -; GFX11-NEXT: v_mad_u64_u32 v[6:7], s2, v2, v9, v[11:12] -; GFX11-NEXT: v_add_co_ci_u32_e64 v12, null, 0, v1, s2 -; GFX11-NEXT: v_mad_u64_u32 v[10:11], s4, v16, v9, v[13:14] -; GFX11-NEXT: v_mad_u64_u32 v[1:2], s2, v4, v9, v[21:22] -; GFX11-NEXT: v_cndmask_b32_e64 v4, 0, 1, s4 -; GFX11-NEXT: v_mad_u64_u32 v[8:9], s4, v3, v18, v[6:7] -; GFX11-NEXT: 
v_add_co_ci_u32_e64 v12, null, 0, v12, s4 -; GFX11-NEXT: v_mad_u64_u32 v[6:7], s4, v5, v18, v[1:2] -; GFX11-NEXT: v_mad_u64_u32 v[1:2], s5, v17, v18, v[10:11] -; GFX11-NEXT: v_add_co_ci_u32_e64 v3, s5, v4, v8, s5 -; GFX11-NEXT: v_add_co_ci_u32_e64 v4, s5, v27, v9, s5 -; GFX11-NEXT: v_add_co_ci_u32_e64 v5, s5, v12, v6, s5 -; GFX11-NEXT: v_add_co_ci_u32_e64 v6, s5, v26, v7, s5 -; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, v25, v28, s5 -; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, v7, v29, s4 -; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, v7, v31, s2 -; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, v7, v32, s3 +; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v5, v9, v[2:3] +; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, v16, v10, 0 +; GFX11-NEXT: v_mad_u64_u32 v[25:26], vcc_lo, v19, v9, v[7:8] +; GFX11-NEXT: v_mad_u64_u32 v[7:8], null, v6, v20, v[0:1] +; GFX11-NEXT: v_mad_u64_u32 v[0:1], s0, v17, v9, v[2:3] +; GFX11-NEXT: v_add_co_ci_u32_e64 v6, null, 0, v22, vcc_lo +; GFX11-NEXT: v_mad_u64_u32 v[23:24], vcc_lo, v4, v20, v[25:26] +; GFX11-NEXT: v_mov_b32_e32 v25, v7 +; GFX11-NEXT: v_cndmask_b32_e64 v22, 0, 1, s0 +; GFX11-NEXT: v_mad_u64_u32 v[2:3], s0, v18, v20, v[0:1] +; GFX11-NEXT: v_add_co_ci_u32_e64 v26, null, 0, v6, vcc_lo +; GFX11-NEXT: v_mad_u64_u32 v[0:1], vcc_lo, v16, v13, v[24:25] +; GFX11-NEXT: v_add_co_ci_u32_e64 v29, null, 0, v22, s0 +; GFX11-NEXT: v_mov_b32_e32 v22, v3 +; GFX11-NEXT: v_mad_u64_u32 v[6:7], s0, v17, v12, v[0:1] +; GFX11-NEXT: v_mad_u64_u32 v[24:25], s2, v16, v11, v[22:23] +; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s2 +; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v16, v20, 0 +; GFX11-NEXT: v_mad_u64_u32 v[22:23], s1, v18, v11, v[6:7] +; GFX11-NEXT: v_mad_u64_u32 v[6:7], s2, v17, v10, v[24:25] +; GFX11-NEXT: v_mul_lo_u32 v24, v19, v12 +; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, s2 +; GFX11-NEXT: v_mad_u64_u32 v[11:12], s3, v19, v10, v[22:23] +; GFX11-NEXT: v_mul_lo_u32 v22, v18, v13 +; GFX11-NEXT: v_mad_u64_u32 v[13:14], s2, v18, v9, v[6:7] +; GFX11-NEXT: v_add_co_ci_u32_e64 v18, null, 0, v3, s2 +; GFX11-NEXT: v_mad_u64_u32 v[6:7], s2, v4, v9, v[11:12] +; GFX11-NEXT: v_mad_u64_u32 v[3:4], s4, v16, v9, v[1:2] +; GFX11-NEXT: v_cndmask_b32_e64 v16, 0, 1, s4 +; GFX11-NEXT: v_mad_u64_u32 v[9:10], s4, v19, v20, v[13:14] +; GFX11-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v18, s4 +; GFX11-NEXT: v_mad_u64_u32 v[11:12], s4, v5, v20, v[6:7] +; GFX11-NEXT: v_mad_u64_u32 v[1:2], s5, v17, v20, v[3:4] +; GFX11-NEXT: v_add_co_ci_u32_e64 v3, s5, v16, v9, s5 +; GFX11-NEXT: v_add_co_ci_u32_e64 v4, s5, v29, v10, s5 +; GFX11-NEXT: v_add_co_ci_u32_e64 v5, s5, v13, v11, s5 +; GFX11-NEXT: v_add_co_ci_u32_e64 v6, s5, v26, v12, s5 +; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, v8, v15, s5 +; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, v7, v31, s4 +; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, v7, v22, s2 +; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, v7, v24, s3 ; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, v7, v30, s1 -; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, v7, v24, vcc_lo -; GFX11-NEXT: v_add_co_ci_u32_e64 v9, null, v7, v23, s0 -; GFX11-NEXT: v_mad_u64_u32 v[7:8], null, v19, v18, v[9:10] +; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, v7, v28, s0 +; GFX11-NEXT: v_add_co_ci_u32_e64 v9, null, v7, v27, vcc_lo +; GFX11-NEXT: v_mad_u64_u32 v[7:8], null, v21, v20, v[9:10] ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: v_mul_i256: @@ -2769,103 +2749,99 @@ define i256 @v_mul_i256(i256 %num, i256 %den) { ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v16, v0 :: 
v_dual_mov_b32 v17, v1 -; GFX12-NEXT: v_mul_lo_u32 v29, v4, v11 -; GFX12-NEXT: v_mul_lo_u32 v31, v3, v12 -; GFX12-NEXT: v_mul_lo_u32 v30, v2, v13 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_dual_mov_b32 v18, v2 :: v_dual_mov_b32 v19, v3 +; GFX12-NEXT: v_mul_lo_u32 v27, v6, v9 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v16, v14, 0 -; GFX12-NEXT: v_mul_lo_u32 v28, v17, v14 -; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], null, v17, v13, v[0:1] -; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v16, v12, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_mad_co_u64_u32 v[20:21], null, v2, v12, v[18:19] -; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], s0, v17, v11, v[0:1] +; GFX12-NEXT: v_mul_lo_u32 v30, v4, v11 +; GFX12-NEXT: v_mul_lo_u32 v28, v5, v10 +; GFX12-NEXT: v_mad_co_u64_u32 v[2:3], null, v17, v13, v[0:1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v18, v12, v[2:3] +; GFX12-NEXT: v_mad_co_u64_u32 v[2:3], null, v16, v12, 0 +; GFX12-NEXT: v_mad_co_u64_u32 v[20:21], null, v19, v11, v[0:1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], s0, v17, v11, v[2:3] ; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: v_cndmask_b32_e64 v22, 0, 1, s0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v3, v11, v[20:21] -; GFX12-NEXT: v_mad_co_u64_u32 v[20:21], vcc_lo, v2, v10, v[18:19] -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_add_co_ci_u32_e64 v22, null, 0, v22, vcc_lo -; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], null, v4, v10, v[0:1] +; GFX12-NEXT: v_mad_co_u64_u32 v[2:3], null, v4, v10, v[20:21] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], vcc_lo, v3, v9, v[20:21] +; GFX12-NEXT: v_mad_co_u64_u32 v[20:21], vcc_lo, v18, v10, v[0:1] ; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_add_co_ci_u32_e64 v26, null, 0, v22, vcc_lo +; GFX12-NEXT: v_add_co_ci_u32_e64 v22, null, 0, v22, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX12-NEXT: v_mad_co_u64_u32 v[20:21], null, v5, v9, v[18:19] -; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], null, v16, v10, 0 -; GFX12-NEXT: v_mad_co_u64_u32 v[22:23], vcc_lo, v4, v8, v[0:1] +; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v5, v9, v[2:3] +; GFX12-NEXT: v_mad_co_u64_u32 v[2:3], null, v16, v10, 0 +; GFX12-NEXT: v_mad_co_u64_u32 v[23:24], vcc_lo, v19, v9, v[20:21] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_mad_co_u64_u32 v[25:26], null, v6, v8, v[0:1] +; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], s0, v17, v9, v[2:3] ; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_add_co_ci_u32_e64 v26, null, 0, v26, vcc_lo -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-NEXT: v_mad_co_u64_u32 v[24:25], null, v6, v8, v[20:21] -; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], s0, v17, v9, v[18:19] +; GFX12-NEXT: v_add_co_ci_u32_e64 v6, null, 0, v22, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX12-NEXT: v_mov_b32_e32 v18, v23 +; GFX12-NEXT: 
v_mad_co_u64_u32 v[21:22], vcc_lo, v4, v8, v[23:24] ; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: v_cndmask_b32_e64 v27, 0, 1, s0 -; GFX12-NEXT: v_mul_lo_u32 v23, v6, v9 -; GFX12-NEXT: v_mov_b32_e32 v19, v24 -; GFX12-NEXT: v_mul_lo_u32 v24, v5, v10 -; GFX12-NEXT: v_mad_co_u64_u32 v[20:21], vcc_lo, v2, v8, v[0:1] +; GFX12-NEXT: v_cndmask_b32_e64 v20, 0, 1, s0 ; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_add_co_ci_u32_e64 v6, null, 0, v27, vcc_lo -; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], s0, v16, v13, v[18:19] -; GFX12-NEXT: v_mov_b32_e32 v19, v22 -; GFX12-NEXT: v_mul_lo_u32 v27, v16, v15 -; GFX12-NEXT: v_mov_b32_e32 v18, v21 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_mad_co_u64_u32 v[21:22], vcc_lo, v17, v12, v[0:1] -; GFX12-NEXT: v_mad_co_u64_u32 v[14:15], s2, v16, v11, v[18:19] -; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v16, v8, 0 +; GFX12-NEXT: v_add_co_ci_u32_e64 v6, null, 0, v6, vcc_lo +; GFX12-NEXT: v_mad_co_u64_u32 v[2:3], s0, v18, v8, v[0:1] +; GFX12-NEXT: v_mov_b32_e32 v23, v25 ; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: v_cndmask_b32_e64 v32, 0, 1, s2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], s1, v2, v11, v[21:22] -; GFX12-NEXT: v_mad_co_u64_u32 v[11:12], s2, v17, v10, v[14:15] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX12-NEXT: v_dual_mov_b32 v13, v1 :: v_dual_mov_b32 v14, v20 +; GFX12-NEXT: v_add_co_ci_u32_e64 v29, null, 0, v20, s0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_mov_b32_e32 v20, v3 +; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], vcc_lo, v16, v13, v[22:23] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_mad_co_u64_u32 v[24:25], s2, v16, v11, v[20:21] +; GFX12-NEXT: s_wait_alu 0xf1ff +; GFX12-NEXT: v_cndmask_b32_e64 v3, 0, 1, s2 +; GFX12-NEXT: v_mad_co_u64_u32 v[22:23], s0, v17, v12, v[0:1] +; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v16, v8, 0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_mad_co_u64_u32 v[20:21], s1, v18, v11, v[22:23] +; GFX12-NEXT: v_mul_lo_u32 v22, v16, v15 +; GFX12-NEXT: v_mul_lo_u32 v23, v17, v14 +; GFX12-NEXT: v_mad_co_u64_u32 v[14:15], s2, v17, v10, v[24:25] +; GFX12-NEXT: v_mul_lo_u32 v24, v19, v12 ; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v32, s2 -; GFX12-NEXT: v_mad_co_u64_u32 v[21:22], s3, v3, v10, v[18:19] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], s2, v2, v9, v[11:12] +; GFX12-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, s2 +; GFX12-NEXT: v_mad_co_u64_u32 v[11:12], s3, v19, v10, v[20:21] +; GFX12-NEXT: v_mul_lo_u32 v25, v18, v13 +; GFX12-NEXT: v_mad_co_u64_u32 v[20:21], s2, v18, v9, v[14:15] ; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v1, s2 -; GFX12-NEXT: v_mad_co_u64_u32 v[10:11], s4, v16, v9, v[13:14] +; GFX12-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v3, s2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], s2, v4, v9, v[21:22] +; GFX12-NEXT: v_mad_co_u64_u32 v[13:14], s2, v4, v9, v[11:12] +; GFX12-NEXT: v_mad_co_u64_u32 v[3:4], s4, v16, v9, v[1:2] ; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: v_cndmask_b32_e64 v4, 0, 1, s4 -; GFX12-NEXT: v_mad_co_u64_u32 v[12:13], s4, v3, v8, v[18:19] +; GFX12-NEXT: 
v_cndmask_b32_e64 v16, 0, 1, s4 +; GFX12-NEXT: v_mad_co_u64_u32 v[9:10], s4, v19, v8, v[20:21] ; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v15, s4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX12-NEXT: v_mad_co_u64_u32 v[14:15], s4, v5, v8, v[1:2] -; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], s5, v17, v8, v[10:11] +; GFX12-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, s4 +; GFX12-NEXT: v_mad_co_u64_u32 v[11:12], s4, v5, v8, v[13:14] +; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], s5, v17, v8, v[3:4] ; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: v_add_co_ci_u32_e64 v3, s5, v4, v12, s5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_add_co_ci_u32_e64 v3, s5, v16, v9, s5 ; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: v_add_co_ci_u32_e64 v4, s5, v6, v13, s5 +; GFX12-NEXT: v_add_co_ci_u32_e64 v4, s5, v29, v10, s5 ; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add_co_ci_u32_e64 v5, s5, v9, v14, s5 +; GFX12-NEXT: v_add_co_ci_u32_e64 v5, s5, v15, v11, s5 ; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: v_add_co_ci_u32_e64 v6, s5, v26, v15, s5 +; GFX12-NEXT: v_add_co_ci_u32_e64 v6, s5, v6, v12, s5 ; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v25, v27, s5 -; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v28, s4 +; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v26, v22, s5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v30, s2 -; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v31, s3 +; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v23, s4 +; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v25, s2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v24, s3 +; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v30, s1 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v29, s1 +; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v28, s0 ; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v24, vcc_lo -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v23, s0 +; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v27, vcc_lo +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_mad_co_u64_u32 v[7:8], null, v7, v8, v[9:10] ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -3155,8 +3131,8 @@ define amdgpu_ps void @s_mul_u64_sext_with_vregs(ptr addrspace(1) %out, ptr addr ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v6, 0 ; GFX7-NEXT: v_ashrrev_i32_e32 v7, 31, v4 -; GFX7-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v7, v6, v[3:4] -; GFX7-NEXT: v_mov_b32_e32 v3, v4 +; GFX7-NEXT: v_mov_b32_e32 v5, v3 +; GFX7-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v7, v6, v[5:6] ; GFX7-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 ; GFX7-NEXT: s_endpgm ; @@ -3167,8 +3143,8 @@ define amdgpu_ps void @s_mul_u64_sext_with_vregs(ptr addrspace(1) %out, ptr addr ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v4, v6, 0 ; GFX8-NEXT: v_ashrrev_i32_e32 v7, 31, v4 -; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v7, v6, v[3:4] -; GFX8-NEXT: v_mov_b32_e32 v3, v4 +; GFX8-NEXT: v_mov_b32_e32 v5, v3 +; 
GFX8-NEXT: v_mad_u64_u32 v[3:4], s[0:1], v7, v6, v[5:6] ; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GFX8-NEXT: s_endpgm ; @@ -3179,8 +3155,8 @@ define amdgpu_ps void @s_mul_u64_sext_with_vregs(ptr addrspace(1) %out, ptr addr ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v4, v6, 0 ; GFX9-NEXT: v_ashrrev_i32_e32 v7, 31, v4 -; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v7, v6, v[3:4] -; GFX9-NEXT: v_mov_b32_e32 v3, v4 +; GFX9-NEXT: v_mov_b32_e32 v5, v3 +; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], v7, v6, v[5:6] ; GFX9-NEXT: global_store_dwordx2 v[0:1], v[2:3], off ; GFX9-NEXT: s_endpgm ; @@ -3200,8 +3176,8 @@ define amdgpu_ps void @s_mul_u64_sext_with_vregs(ptr addrspace(1) %out, ptr addr ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, 0x50, v4, 0 ; GFX11-NEXT: v_ashrrev_i32_e32 v6, 31, v4 -; GFX11-NEXT: v_mad_u64_u32 v[4:5], null, 0x50, v6, v[3:4] -; GFX11-NEXT: v_mov_b32_e32 v3, v4 +; GFX11-NEXT: v_mov_b32_e32 v5, v3 +; GFX11-NEXT: v_mad_u64_u32 v[3:4], null, 0x50, v6, v[5:6] ; GFX11-NEXT: global_store_b64 v[0:1], v[2:3], off ; GFX11-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll index 2843f72353db1..b7c84f1389197 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll @@ -31,102 +31,100 @@ define i64 @v_sdiv_i64(i64 %num, i64 %den) { ; CHECK-NEXT: v_xor_b32_e32 v1, v3, v0 ; CHECK-NEXT: v_cvt_f32_u32_e32 v3, v2 ; CHECK-NEXT: v_cvt_f32_u32_e32 v6, v1 -; CHECK-NEXT: v_sub_i32_e32 v12, vcc, 0, v2 -; CHECK-NEXT: v_subb_u32_e32 v13, vcc, 0, v1, vcc +; CHECK-NEXT: v_sub_i32_e32 v13, vcc, 0, v2 +; CHECK-NEXT: v_subb_u32_e32 v14, vcc, 0, v1, vcc ; CHECK-NEXT: v_mac_f32_e32 v3, 0x4f800000, v6 ; CHECK-NEXT: v_rcp_iflag_f32_e32 v3, v3 ; CHECK-NEXT: v_mul_f32_e32 v3, 0x5f7ffffc, v3 ; CHECK-NEXT: v_mul_f32_e32 v6, 0x2f800000, v3 -; CHECK-NEXT: v_trunc_f32_e32 v8, v6 -; CHECK-NEXT: v_mac_f32_e32 v3, 0xcf800000, v8 -; CHECK-NEXT: v_cvt_u32_f32_e32 v11, v3 -; CHECK-NEXT: v_cvt_u32_f32_e32 v14, v8 -; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v12, v11, 0 -; CHECK-NEXT: v_mov_b32_e32 v3, v7 -; CHECK-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v12, v14, v[3:4] -; CHECK-NEXT: v_mul_lo_u32 v3, v14, v6 -; CHECK-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v11, v[7:8] -; CHECK-NEXT: v_mul_hi_u32 v7, v11, v6 -; CHECK-NEXT: v_mul_hi_u32 v6, v14, v6 -; CHECK-NEXT: v_mul_lo_u32 v8, v11, v9 -; CHECK-NEXT: v_mul_lo_u32 v10, v14, v9 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v8 -; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v7 -; CHECK-NEXT: v_mul_hi_u32 v7, v11, v9 -; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v8, v3 -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v10, v6 +; CHECK-NEXT: v_trunc_f32_e32 v6, v6 +; CHECK-NEXT: v_mac_f32_e32 v3, 0xcf800000, v6 +; CHECK-NEXT: v_cvt_u32_f32_e32 v3, v3 +; CHECK-NEXT: v_cvt_u32_f32_e32 v12, v6 +; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v13, v3, 0 +; CHECK-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v13, v12, v[7:8] +; CHECK-NEXT: v_mul_lo_u32 v7, v12, v6 +; CHECK-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v3, v[8:9] +; CHECK-NEXT: v_mul_hi_u32 v8, v3, v6 +; CHECK-NEXT: v_mul_hi_u32 v6, v12, v6 +; CHECK-NEXT: v_mul_lo_u32 v9, v3, v10 +; CHECK-NEXT: v_mul_lo_u32 v11, v12, v10 +; CHECK-NEXT: v_add_i32_e32 v7, vcc, v7, v9 +; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v7, vcc, v7, v8 +; CHECK-NEXT: 
v_mul_hi_u32 v8, v3, v10 +; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v7, vcc, v9, v7 +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v11, v6 +; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v8 ; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v8, vcc, v9, v8 +; CHECK-NEXT: v_mul_hi_u32 v9, v12, v10 ; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v7 ; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v7, vcc, v8, v7 -; CHECK-NEXT: v_mul_hi_u32 v8, v14, v9 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v6, v3 -; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v8, v6 -; CHECK-NEXT: v_add_i32_e32 v11, vcc, v11, v3 -; CHECK-NEXT: v_addc_u32_e32 v14, vcc, v14, v6, vcc -; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v12, v11, 0 -; CHECK-NEXT: v_mov_b32_e32 v3, v7 -; CHECK-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v12, v14, v[3:4] -; CHECK-NEXT: v_ashrrev_i32_e32 v12, 31, v5 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v12 -; CHECK-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v11, v[7:8] -; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v5, v12, vcc -; CHECK-NEXT: v_xor_b32_e32 v10, v3, v12 -; CHECK-NEXT: v_mul_lo_u32 v3, v14, v6 -; CHECK-NEXT: v_mul_lo_u32 v5, v11, v9 -; CHECK-NEXT: v_xor_b32_e32 v13, v4, v12 -; CHECK-NEXT: v_mul_hi_u32 v4, v11, v6 -; CHECK-NEXT: v_mul_hi_u32 v6, v14, v6 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v5 -; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v4 -; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v4, v14, v9 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v5, v3 -; CHECK-NEXT: v_mul_hi_u32 v5, v11, v9 -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v6 -; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v7, vcc, v9, v7 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v6 +; CHECK-NEXT: v_addc_u32_e32 v12, vcc, v12, v7, vcc +; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v13, v3, 0 +; CHECK-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v13, v12, v[7:8] +; CHECK-NEXT: v_ashrrev_i32_e32 v13, 31, v5 +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v13 +; CHECK-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v3, v[8:9] +; CHECK-NEXT: v_addc_u32_e32 v5, vcc, v5, v13, vcc +; CHECK-NEXT: v_xor_b32_e32 v9, v4, v13 +; CHECK-NEXT: v_mul_lo_u32 v4, v12, v6 +; CHECK-NEXT: v_mul_lo_u32 v7, v3, v10 +; CHECK-NEXT: v_xor_b32_e32 v11, v5, v13 +; CHECK-NEXT: v_mul_hi_u32 v5, v3, v6 +; CHECK-NEXT: v_mul_hi_u32 v6, v12, v6 +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v7 +; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v5 -; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; CHECK-NEXT: v_mul_hi_u32 v6, v14, v9 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v3 ; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; CHECK-NEXT: v_mul_lo_u32 v5, v12, v10 +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v7, v4 +; CHECK-NEXT: v_mul_hi_u32 v7, v3, v10 +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v6 +; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v7 +; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v7 +; CHECK-NEXT: v_mul_hi_u32 v7, v12, v10 ; CHECK-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v6, v4 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v11, v3 -; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v14, v4, vcc -; CHECK-NEXT: v_mul_lo_u32 v5, v13, v3 -; CHECK-NEXT: 
v_mul_lo_u32 v6, v10, v4 -; CHECK-NEXT: v_mul_hi_u32 v7, v10, v3 -; CHECK-NEXT: v_mul_hi_u32 v3, v13, v3 +; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5 +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v7, v5 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v4 +; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v12, v5, vcc +; CHECK-NEXT: v_mul_lo_u32 v5, v11, v3 +; CHECK-NEXT: v_mul_lo_u32 v6, v9, v4 +; CHECK-NEXT: v_mul_hi_u32 v7, v9, v3 +; CHECK-NEXT: v_mul_hi_u32 v3, v11, v3 ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v6 ; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v7 ; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v7, v13, v4 +; CHECK-NEXT: v_mul_lo_u32 v7, v11, v4 ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; CHECK-NEXT: v_mul_hi_u32 v6, v10, v4 +; CHECK-NEXT: v_mul_hi_u32 v6, v9, v4 ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v7, v3 ; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v6 ; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; CHECK-NEXT: v_add_i32_e32 v9, vcc, v3, v5 -; CHECK-NEXT: v_mul_hi_u32 v7, v13, v4 -; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v2, v9, 0 -; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; CHECK-NEXT: v_add_i32_e32 v11, vcc, v7, v5 -; CHECK-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v2, v11, v[4:5] -; CHECK-NEXT: v_sub_i32_e32 v3, vcc, v10, v3 -; CHECK-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v1, v9, v[5:6] -; CHECK-NEXT: v_subb_u32_e64 v4, s[4:5], v13, v7, vcc -; CHECK-NEXT: v_sub_i32_e64 v5, s[4:5], v13, v7 +; CHECK-NEXT: v_add_i32_e32 v10, vcc, v3, v5 +; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v3 +; CHECK-NEXT: v_mul_hi_u32 v6, v11, v4 +; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v2, v10, 0 +; CHECK-NEXT: v_add_i32_e32 v12, vcc, v6, v5 +; CHECK-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v2, v12, v[4:5] +; CHECK-NEXT: v_sub_i32_e32 v3, vcc, v9, v3 +; CHECK-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v1, v10, v[5:6] +; CHECK-NEXT: v_subb_u32_e64 v4, s[4:5], v11, v7, vcc +; CHECK-NEXT: v_sub_i32_e64 v5, s[4:5], v11, v7 ; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v1 ; CHECK-NEXT: v_subb_u32_e32 v5, vcc, v5, v1, vcc ; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5] @@ -136,8 +134,8 @@ define i64 @v_sdiv_i64(i64 %num, i64 %den) { ; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, v1 ; CHECK-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc ; CHECK-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5] -; CHECK-NEXT: v_add_i32_e32 v6, vcc, 1, v9 -; CHECK-NEXT: v_addc_u32_e32 v7, vcc, 0, v11, vcc +; CHECK-NEXT: v_add_i32_e32 v6, vcc, 1, v10 +; CHECK-NEXT: v_addc_u32_e32 v7, vcc, 0, v12, vcc ; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v5, v1 ; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc ; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v3, v2 @@ -150,9 +148,9 @@ define i64 @v_sdiv_i64(i64 %num, i64 %den) { ; CHECK-NEXT: v_cndmask_b32_e32 v1, v6, v2, vcc ; CHECK-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; CHECK-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc -; CHECK-NEXT: v_xor_b32_e32 v3, v12, v0 -; CHECK-NEXT: v_cndmask_b32_e32 v2, v11, v2, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v1, v10, v1, vcc +; CHECK-NEXT: v_xor_b32_e32 v3, v13, v0 +; CHECK-NEXT: v_cndmask_b32_e32 v2, v12, v2, vcc ; CHECK-NEXT: v_xor_b32_e32 v0, v1, v3 ; CHECK-NEXT: v_xor_b32_e32 v1, v2, v3 ; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v3 @@ -218,10 +216,10 @@ define amdgpu_ps i64 @s_sdiv_i64(i64 
inreg %num, i64 inreg %den) { ; CHECK-NEXT: s_subb_u32 s5, 0, s11 ; CHECK-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; CHECK-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 -; CHECK-NEXT: v_trunc_f32_e32 v2, v1 -; CHECK-NEXT: v_mac_f32_e32 v0, 0xcf800000, v2 +; CHECK-NEXT: v_trunc_f32_e32 v1, v1 +; CHECK-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 ; CHECK-NEXT: v_cvt_u32_f32_e32 v6, v0 -; CHECK-NEXT: v_cvt_u32_f32_e32 v7, v2 +; CHECK-NEXT: v_cvt_u32_f32_e32 v7, v1 ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s3, v6, 0 ; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s3, v7, v[1:2] ; CHECK-NEXT: v_mul_lo_u32 v1, v7, v0 @@ -292,11 +290,11 @@ define amdgpu_ps i64 @s_sdiv_i64(i64 inreg %num, i64 inreg %den) { ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v3 ; CHECK-NEXT: v_add_i32_e32 v6, vcc, v0, v2 -; CHECK-NEXT: v_mul_hi_u32 v4, s13, v1 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v0 +; CHECK-NEXT: v_mul_hi_u32 v3, s13, v1 ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s10, v6, 0 -; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v2 +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v3, v2 ; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s10, v4, v[1:2] ; CHECK-NEXT: v_sub_i32_e32 v0, vcc, s12, v0 ; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s11, v6, v[2:3] @@ -379,266 +377,260 @@ define <2 x i64> @v_sdiv_v2i64(<2 x i64> %num, <2 x i64> %den) { ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v8 ; GISEL-NEXT: v_addc_u32_e32 v5, vcc, v5, v8, vcc ; GISEL-NEXT: v_xor_b32_e32 v10, v4, v8 -; GISEL-NEXT: v_xor_b32_e32 v4, v5, v8 -; GISEL-NEXT: v_cvt_f32_u32_e32 v5, v10 -; GISEL-NEXT: v_cvt_f32_u32_e32 v9, v4 +; GISEL-NEXT: v_xor_b32_e32 v9, v5, v8 +; GISEL-NEXT: v_cvt_f32_u32_e32 v4, v10 +; GISEL-NEXT: v_cvt_f32_u32_e32 v5, v9 ; GISEL-NEXT: v_sub_i32_e32 v17, vcc, 0, v10 -; GISEL-NEXT: v_subb_u32_e32 v18, vcc, 0, v4, vcc -; GISEL-NEXT: v_mac_f32_e32 v5, 0x4f800000, v9 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v5, v5 -; GISEL-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5 -; GISEL-NEXT: v_mul_f32_e32 v9, 0x2f800000, v5 -; GISEL-NEXT: v_trunc_f32_e32 v9, v9 -; GISEL-NEXT: v_mac_f32_e32 v5, 0xcf800000, v9 +; GISEL-NEXT: v_subb_u32_e32 v18, vcc, 0, v9, vcc +; GISEL-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v4, v4 +; GISEL-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 +; GISEL-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4 +; GISEL-NEXT: v_trunc_f32_e32 v5, v5 +; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v5 +; GISEL-NEXT: v_cvt_u32_f32_e32 v15, v4 ; GISEL-NEXT: v_cvt_u32_f32_e32 v16, v5 -; GISEL-NEXT: v_cvt_u32_f32_e32 v9, v9 -; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v17, v16, 0 -; GISEL-NEXT: v_mov_b32_e32 v5, v12 -; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v17, v9, v[5:6] -; GISEL-NEXT: v_mul_lo_u32 v5, v9, v11 -; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v18, v16, v[12:13] -; GISEL-NEXT: v_mul_hi_u32 v13, v16, v11 -; GISEL-NEXT: v_mul_hi_u32 v11, v9, v11 -; GISEL-NEXT: v_mul_lo_u32 v12, v16, v14 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v12 +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v17, v15, 0 +; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v17, v16, v[5:6] +; GISEL-NEXT: v_mul_lo_u32 v5, v16, v4 +; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v18, v15, v[11:12] +; GISEL-NEXT: v_mul_lo_u32 v11, v15, v13 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v11 +; GISEL-NEXT: v_mul_hi_u32 v11, v15, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, 
v5, v13 +; GISEL-NEXT: v_mul_hi_u32 v4, v16, v4 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v13, v9, v14 +; GISEL-NEXT: v_mul_lo_u32 v11, v16, v13 ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v12, v5 -; GISEL-NEXT: v_mul_hi_u32 v12, v16, v14 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v13, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 +; GISEL-NEXT: v_mul_hi_u32 v12, v15, v13 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v11, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12 -; GISEL-NEXT: v_mul_hi_u32 v13, v9, v14 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 +; GISEL-NEXT: v_mul_hi_u32 v12, v16, v13 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v11, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v13, v11 -; GISEL-NEXT: v_add_i32_e32 v16, vcc, v16, v5 -; GISEL-NEXT: v_addc_u32_e32 v19, vcc, v9, v11, vcc -; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v17, v16, 0 -; GISEL-NEXT: v_ashrrev_i32_e32 v9, 31, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v9 -; GISEL-NEXT: v_mov_b32_e32 v5, v12 -; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v17, v19, v[5:6] -; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v9, vcc -; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v18, v16, v[12:13] -; GISEL-NEXT: v_xor_b32_e32 v17, v0, v9 -; GISEL-NEXT: v_mul_lo_u32 v0, v19, v11 -; GISEL-NEXT: v_mul_lo_u32 v5, v16, v14 -; GISEL-NEXT: v_xor_b32_e32 v18, v1, v9 -; GISEL-NEXT: v_mul_hi_u32 v1, v16, v11 -; GISEL-NEXT: v_mul_hi_u32 v11, v19, v11 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v12, v5 +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v4 +; GISEL-NEXT: v_addc_u32_e32 v16, vcc, v16, v5, vcc +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v17, v15, 0 +; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v17, v16, v[5:6] +; GISEL-NEXT: v_ashrrev_i32_e32 v5, 31, v1 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v18, v15, v[11:12] +; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v5, vcc +; GISEL-NEXT: v_xor_b32_e32 v17, v0, v5 +; GISEL-NEXT: v_mul_lo_u32 v0, v16, v4 +; GISEL-NEXT: v_mul_lo_u32 v11, v15, v13 +; GISEL-NEXT: v_xor_b32_e32 v18, v1, v5 +; GISEL-NEXT: v_mul_hi_u32 v1, v15, v4 +; GISEL-NEXT: v_mul_hi_u32 v4, v16, v4 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v1, v19, v14 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0 -; GISEL-NEXT: v_mul_hi_u32 v5, v16, v14 +; GISEL-NEXT: v_mul_lo_u32 v1, v16, v13 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v11, v0 +; GISEL-NEXT: v_mul_hi_u32 v11, v15, v13 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v11, v5 -; GISEL-NEXT: v_mul_hi_u32 v11, v19, v14 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v11 +; GISEL-NEXT: v_mul_hi_u32 v11, v16, v13 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v1, v0 
; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v5, v1 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v4, v1 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v11, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v16, v0 -; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v19, v1, vcc -; GISEL-NEXT: v_mul_lo_u32 v5, v18, v0 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v15, v0 +; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v16, v1, vcc +; GISEL-NEXT: v_mul_lo_u32 v4, v18, v0 ; GISEL-NEXT: v_mul_lo_u32 v11, v17, v1 ; GISEL-NEXT: v_mul_hi_u32 v12, v17, v0 ; GISEL-NEXT: v_mul_hi_u32 v0, v18, v0 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v11 +; GISEL-NEXT: v_xor_b32_e32 v5, v5, v8 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GISEL-NEXT: v_mul_lo_u32 v12, v18, v1 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v11, v5 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v11, v4 ; GISEL-NEXT: v_mul_hi_u32 v11, v17, v1 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v12, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v12, v11 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v5 -; GISEL-NEXT: v_mul_hi_u32 v1, v18, v1 -; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v10, v0, 0 -; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v13, v5 -; GISEL-NEXT: v_add_i32_e32 v16, vcc, v1, v5 -; GISEL-NEXT: v_mov_b32_e32 v1, v12 -; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v10, v16, v[1:2] -; GISEL-NEXT: v_ashrrev_i32_e32 v5, 31, v7 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v6, v5 -; GISEL-NEXT: v_addc_u32_e32 v6, vcc, v7, v5, vcc -; GISEL-NEXT: v_xor_b32_e32 v7, v1, v5 -; GISEL-NEXT: v_xor_b32_e32 v6, v6, v5 -; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v4, v0, v[12:13] -; GISEL-NEXT: v_cvt_f32_u32_e32 v1, v7 -; GISEL-NEXT: v_cvt_f32_u32_e32 v12, v6 -; GISEL-NEXT: v_sub_i32_e32 v13, vcc, v17, v11 -; GISEL-NEXT: v_sub_i32_e64 v11, s[4:5], v18, v14 -; GISEL-NEXT: v_mac_f32_e32 v1, 0x4f800000, v12 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; GISEL-NEXT: v_subb_u32_e64 v17, s[4:5], v18, v14, vcc -; GISEL-NEXT: v_subb_u32_e32 v14, vcc, v11, v4, vcc -; GISEL-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v1 -; GISEL-NEXT: v_mul_f32_e32 v11, 0x2f800000, v1 -; GISEL-NEXT: v_trunc_f32_e32 v15, v11 -; GISEL-NEXT: v_mac_f32_e32 v1, 0xcf800000, v15 -; GISEL-NEXT: v_cvt_u32_f32_e32 v18, v1 -; GISEL-NEXT: v_sub_i32_e32 v19, vcc, 0, v7 -; GISEL-NEXT: v_subb_u32_e32 v20, vcc, 0, v6, vcc -; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[6:7], v19, v18, 0 -; GISEL-NEXT: v_cvt_u32_f32_e32 v22, v15 -; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v13, v10 -; GISEL-NEXT: v_subbrev_u32_e32 v21, vcc, 0, v14, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v1, v10 -; GISEL-NEXT: v_mov_b32_e32 v1, v12 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v13, v10 -; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[6:7], v19, v22, v[1:2] -; GISEL-NEXT: v_mul_lo_u32 v1, v22, v11 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v21, v4 -; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[6:7], v20, v18, v[12:13] -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v17, v4 -; GISEL-NEXT: v_mul_lo_u32 v10, v18, v14 -; GISEL-NEXT: v_add_i32_e64 v1, s[6:7], v1, v10 -; GISEL-NEXT: v_mul_hi_u32 v10, v18, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[6:7] 
-; GISEL-NEXT: v_mul_hi_u32 v11, v22, v11 -; GISEL-NEXT: v_add_i32_e64 v1, s[6:7], v1, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[8:9] -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, -1, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v21, v4 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v17, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[8:9] -; GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v10, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, 1, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v4, v13, v15, s[4:5] -; GISEL-NEXT: v_addc_u32_e32 v13, vcc, 0, v16, vcc -; GISEL-NEXT: v_add_i32_e32 v15, vcc, 1, v10 -; GISEL-NEXT: v_addc_u32_e32 v17, vcc, 0, v13, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; GISEL-NEXT: v_cndmask_b32_e32 v1, v10, v15, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[6:7] -; GISEL-NEXT: v_cndmask_b32_e32 v13, v13, v17, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v12, v10 -; GISEL-NEXT: v_mul_lo_u32 v12, v22, v14 ; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11 -; GISEL-NEXT: v_mul_hi_u32 v12, v18, v14 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; GISEL-NEXT: v_mul_hi_u32 v14, v22, v14 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v15, v12 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v14, v11 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v18, v10 -; GISEL-NEXT: v_addc_u32_e32 v14, vcc, v22, v11, vcc -; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v19, v12, 0 -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GISEL-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc -; GISEL-NEXT: v_mov_b32_e32 v0, v11 -; GISEL-NEXT: v_cndmask_b32_e32 v4, v16, v13, vcc -; GISEL-NEXT: v_xor_b32_e32 v13, v9, v8 -; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v19, v14, v[0:1] -; GISEL-NEXT: v_xor_b32_e32 v11, v1, v13 +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v0, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v11, v0 +; GISEL-NEXT: v_mul_hi_u32 v11, v18, v1 +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v10, v15, 0 +; GISEL-NEXT: v_add_i32_e32 v16, vcc, v11, v4 +; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v10, v16, v[1:2] +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v17, v0 +; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v9, v15, v[11:12] +; GISEL-NEXT: v_subb_u32_e64 v1, s[4:5], v18, v13, vcc +; GISEL-NEXT: v_sub_i32_e64 v4, s[4:5], v18, v13 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v1, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v1, v9 +; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v4, v9, vcc +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v10 +; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v11, v11, v12, s[4:5] +; GISEL-NEXT: v_add_i32_e32 v12, vcc, 1, v15 +; GISEL-NEXT: v_addc_u32_e32 v13, vcc, 0, v16, vcc +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v1, v9 +; GISEL-NEXT: v_ashrrev_i32_e32 v4, 31, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, -1, vcc +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v4 +; GISEL-NEXT: v_addc_u32_e32 v17, vcc, v7, v4, vcc +; GISEL-NEXT: v_xor_b32_e32 v7, v6, v4 +; GISEL-NEXT: v_xor_b32_e32 v6, v17, v4 +; GISEL-NEXT: v_cvt_f32_u32_e32 v17, v7 +; GISEL-NEXT: v_cvt_f32_u32_e32 v18, v6 +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc +; GISEL-NEXT: 
v_cmp_eq_u32_e32 vcc, v1, v9 +; GISEL-NEXT: v_mac_f32_e32 v17, 0x4f800000, v18 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v1, v17 +; GISEL-NEXT: v_cndmask_b32_e32 v0, v14, v0, vcc +; GISEL-NEXT: v_add_i32_e32 v9, vcc, 1, v12 +; GISEL-NEXT: v_addc_u32_e32 v14, vcc, 0, v13, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GISEL-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v1 +; GISEL-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 +; GISEL-NEXT: v_trunc_f32_e32 v1, v1 +; GISEL-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 +; GISEL-NEXT: v_cvt_u32_f32_e32 v17, v0 +; GISEL-NEXT: v_sub_i32_e64 v19, s[4:5], 0, v7 +; GISEL-NEXT: v_cvt_u32_f32_e32 v18, v1 +; GISEL-NEXT: v_subb_u32_e64 v20, s[4:5], 0, v6, s[4:5] +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v19, v17, 0 +; GISEL-NEXT: v_cndmask_b32_e32 v12, v12, v9, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v13, v13, v14, vcc +; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v19, v18, v[1:2] +; GISEL-NEXT: v_mul_lo_u32 v21, v18, v0 +; GISEL-NEXT: v_mul_hi_u32 v22, v17, v0 +; GISEL-NEXT: v_mul_hi_u32 v23, v18, v0 +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v20, v17, v[9:10] +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; GISEL-NEXT: v_cndmask_b32_e32 v10, v15, v12, vcc +; GISEL-NEXT: v_mul_lo_u32 v1, v17, v0 +; GISEL-NEXT: v_mul_lo_u32 v11, v18, v0 ; GISEL-NEXT: v_ashrrev_i32_e32 v15, 31, v3 -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v20, v12, v[8:9] -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v2, v15 -; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v3, v15, vcc -; GISEL-NEXT: v_xor_b32_e32 v16, v1, v15 -; GISEL-NEXT: v_mul_lo_u32 v1, v14, v10 -; GISEL-NEXT: v_mul_lo_u32 v3, v12, v0 -; GISEL-NEXT: v_xor_b32_e32 v17, v2, v15 -; GISEL-NEXT: v_mul_hi_u32 v2, v12, v10 -; GISEL-NEXT: v_mul_hi_u32 v8, v14, v10 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v3 -; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v2 -; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v13, v16, v13, vcc +; GISEL-NEXT: v_add_i32_e64 v1, s[4:5], v21, v1 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v1, s[4:5], v1, v22 +; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v1, s[4:5], v9, v1 +; GISEL-NEXT: v_mul_hi_u32 v9, v17, v0 +; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v11, v23 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v11, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v12, v11 +; GISEL-NEXT: v_mul_hi_u32 v0, v18, v0 +; GISEL-NEXT: v_add_i32_e64 v1, s[4:5], v9, v1 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v11, v9 +; GISEL-NEXT: v_add_i32_e64 v0, s[4:5], v0, v9 +; GISEL-NEXT: v_add_i32_e64 v12, s[4:5], v17, v1 +; GISEL-NEXT: v_addc_u32_e64 v14, s[4:5], v18, v0, s[4:5] +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v19, v12, 0 +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v19, v14, v[1:2] +; GISEL-NEXT: v_xor_b32_e32 v1, v10, v5 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v15 +; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v20, v12, v[8:9] +; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v15, vcc +; GISEL-NEXT: v_xor_b32_e32 v16, v2, v15 ; GISEL-NEXT: v_mul_lo_u32 v2, v14, v0 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v3, v1 +; GISEL-NEXT: v_mul_lo_u32 v8, v12, v10 +; GISEL-NEXT: v_xor_b32_e32 v17, v3, v15 ; GISEL-NEXT: v_mul_hi_u32 v3, v12, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v14, v0 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GISEL-NEXT: 
v_add_i32_e32 v2, vcc, v2, v3 -; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v8, v3 -; GISEL-NEXT: v_mul_hi_u32 v0, v14, v0 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v2, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v12, v1 -; GISEL-NEXT: v_addc_u32_e32 v0, vcc, v14, v0, vcc -; GISEL-NEXT: v_mul_lo_u32 v2, v17, v1 -; GISEL-NEXT: v_mul_lo_u32 v3, v16, v0 -; GISEL-NEXT: v_xor_b32_e32 v8, v4, v13 -; GISEL-NEXT: v_mul_hi_u32 v4, v16, v1 -; GISEL-NEXT: v_mul_hi_u32 v1, v17, v1 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; GISEL-NEXT: v_mul_lo_u32 v3, v14, v10 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v8, v2 +; GISEL-NEXT: v_mul_hi_u32 v8, v12, v10 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v4 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v8 +; GISEL-NEXT: v_mul_hi_u32 v8, v14, v10 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v4, v17, v0 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GISEL-NEXT: v_mul_hi_u32 v3, v16, v0 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v4, v1 -; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v3 -; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v3 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v1, v2 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v8, v2 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v12, v0 +; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v14, v2, vcc +; GISEL-NEXT: v_mul_lo_u32 v3, v17, v0 +; GISEL-NEXT: v_mul_lo_u32 v8, v16, v2 +; GISEL-NEXT: v_mul_hi_u32 v9, v16, v0 ; GISEL-NEXT: v_mul_hi_u32 v0, v17, v0 -; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v7, v10, 0 -; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v4, v1 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v0, v1 -; GISEL-NEXT: v_mov_b32_e32 v0, v3 -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v7, v12, v[0:1] -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v11, v13 -; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v8, v13, vcc -; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v6, v10, v[3:4] +; GISEL-NEXT: v_xor_b32_e32 v10, v13, v5 +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v9, v17, v2 +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v8, v3 +; GISEL-NEXT: v_mul_hi_u32 v8, v16, v2 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v9, v0 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v0, v3 +; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v8, v0 +; GISEL-NEXT: v_mul_hi_u32 v8, v17, v2 +; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v7, v12, 0 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v8, v0 +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v7, v13, v[3:4] +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v1, v5 +; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v10, v5, vcc +; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v6, v12, v[8:9] ; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v16, v2 -; GISEL-NEXT: v_subb_u32_e64 v3, s[4:5], v17, v8, vcc -; 
GISEL-NEXT: v_sub_i32_e64 v4, s[4:5], v17, v8 +; GISEL-NEXT: v_subb_u32_e64 v3, s[4:5], v17, v10, vcc +; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], v17, v10 ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v6 -; GISEL-NEXT: v_subb_u32_e32 v4, vcc, v4, v6, vcc +; GISEL-NEXT: v_subb_u32_e32 v5, vcc, v5, v6, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v7 ; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] ; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v3, v6 -; GISEL-NEXT: v_subbrev_u32_e32 v4, vcc, 0, v4, vcc +; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v3, v8, v9, s[4:5] -; GISEL-NEXT: v_add_i32_e32 v8, vcc, 1, v10 -; GISEL-NEXT: v_addc_u32_e32 v9, vcc, 0, v12, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v4, v6 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, vcc +; GISEL-NEXT: v_add_i32_e32 v8, vcc, 1, v12 +; GISEL-NEXT: v_addc_u32_e32 v9, vcc, 0, v13, vcc +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v5, v6 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, -1, vcc ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v2, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 -; GISEL-NEXT: v_cndmask_b32_e32 v2, v11, v2, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, 1, v8 +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 +; GISEL-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc +; GISEL-NEXT: v_add_i32_e32 v5, vcc, 1, v8 ; GISEL-NEXT: v_addc_u32_e32 v6, vcc, 0, v9, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GISEL-NEXT: v_cndmask_b32_e32 v2, v8, v4, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v4, v9, v6, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v2, v8, v5, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v5, v9, v6, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; GISEL-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v3, v12, v4, vcc -; GISEL-NEXT: v_xor_b32_e32 v4, v15, v5 +; GISEL-NEXT: v_cndmask_b32_e32 v2, v12, v2, vcc +; GISEL-NEXT: v_xor_b32_e32 v4, v15, v4 +; GISEL-NEXT: v_cndmask_b32_e32 v3, v13, v5, vcc ; GISEL-NEXT: v_xor_b32_e32 v2, v2, v4 ; GISEL-NEXT: v_xor_b32_e32 v3, v3, v4 ; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v4 @@ -667,28 +659,28 @@ define <2 x i64> @v_sdiv_v2i64(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: v_xor_b32_e32 v1, v3, v0 ; CGP-NEXT: v_cvt_f32_u32_e32 v3, v2 ; CGP-NEXT: v_cvt_f32_u32_e32 v4, v1 -; CGP-NEXT: v_sub_i32_e32 v15, vcc, 0, v2 -; CGP-NEXT: v_subb_u32_e32 v16, vcc, 0, v1, vcc +; CGP-NEXT: v_sub_i32_e32 v16, vcc, 0, v2 +; CGP-NEXT: v_subb_u32_e32 v17, vcc, 0, v1, vcc ; CGP-NEXT: v_mac_f32_e32 v3, 0x4f800000, v4 ; CGP-NEXT: v_rcp_iflag_f32_e32 v3, v3 ; CGP-NEXT: v_mul_f32_e32 v3, 0x5f7ffffc, v3 ; CGP-NEXT: v_mul_f32_e32 v4, 0x2f800000, v3 -; CGP-NEXT: v_trunc_f32_e32 v5, v4 -; CGP-NEXT: v_mac_f32_e32 v3, 0xcf800000, v5 -; CGP-NEXT: v_cvt_u32_f32_e32 v14, v3 -; CGP-NEXT: v_cvt_u32_f32_e32 v17, v5 -; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v15, v14, 0 -; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v15, v17, v[4:5] -; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v16, v14, v[12:13] -; CGP-NEXT: v_mul_lo_u32 v5, v17, v3 -; CGP-NEXT: v_mul_hi_u32 v12, v14, v3 -; CGP-NEXT: v_mul_lo_u32 v13, v14, v4 -; CGP-NEXT: v_mul_hi_u32 v3, v17, v3 -; CGP-NEXT: v_mul_lo_u32 v18, v17, v4 +; CGP-NEXT: v_trunc_f32_e32 v4, v4 +; CGP-NEXT: v_mac_f32_e32 v3, 0xcf800000, v4 +; CGP-NEXT: v_cvt_u32_f32_e32 v15, v3 +; CGP-NEXT: v_cvt_u32_f32_e32 v14, v4 +; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v16, v15, 0 +; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v16, v14, v[4:5] +; CGP-NEXT: 
v_mad_u64_u32 v[4:5], s[4:5], v17, v15, v[12:13] +; CGP-NEXT: v_mul_lo_u32 v5, v14, v3 +; CGP-NEXT: v_mul_hi_u32 v12, v15, v3 +; CGP-NEXT: v_mul_lo_u32 v13, v15, v4 +; CGP-NEXT: v_mul_hi_u32 v3, v14, v3 +; CGP-NEXT: v_mul_lo_u32 v18, v14, v4 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v13 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v12 -; CGP-NEXT: v_mul_hi_u32 v12, v14, v4 +; CGP-NEXT: v_mul_hi_u32 v12, v15, v4 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v5, vcc, v13, v5 ; CGP-NEXT: v_add_i32_e32 v3, vcc, v18, v3 @@ -696,44 +688,44 @@ define <2 x i64> @v_sdiv_v2i64(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v12 ; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v12, vcc, v13, v12 -; CGP-NEXT: v_mul_hi_u32 v4, v17, v4 +; CGP-NEXT: v_mul_hi_u32 v4, v14, v4 ; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v5 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v5, vcc, v12, v5 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5 -; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v3 -; CGP-NEXT: v_addc_u32_e32 v17, vcc, v17, v4, vcc -; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v15, v14, 0 -; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v15, v17, v[4:5] -; CGP-NEXT: v_ashrrev_i32_e32 v15, 31, v11 -; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v16, v14, v[12:13] -; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v15 -; CGP-NEXT: v_addc_u32_e32 v10, vcc, v11, v15, vcc -; CGP-NEXT: v_xor_b32_e32 v12, v5, v15 -; CGP-NEXT: v_mul_lo_u32 v5, v17, v3 -; CGP-NEXT: v_mul_lo_u32 v11, v14, v4 -; CGP-NEXT: v_xor_b32_e32 v13, v10, v15 -; CGP-NEXT: v_mul_hi_u32 v10, v14, v3 -; CGP-NEXT: v_mul_hi_u32 v3, v17, v3 +; CGP-NEXT: v_add_i32_e32 v15, vcc, v15, v3 +; CGP-NEXT: v_addc_u32_e32 v14, vcc, v14, v4, vcc +; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v16, v15, 0 +; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v16, v14, v[4:5] +; CGP-NEXT: v_ashrrev_i32_e32 v16, 31, v11 +; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v17, v15, v[12:13] +; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v16 +; CGP-NEXT: v_addc_u32_e32 v10, vcc, v11, v16, vcc +; CGP-NEXT: v_xor_b32_e32 v12, v5, v16 +; CGP-NEXT: v_mul_lo_u32 v5, v14, v3 +; CGP-NEXT: v_mul_lo_u32 v11, v15, v4 +; CGP-NEXT: v_xor_b32_e32 v13, v10, v16 +; CGP-NEXT: v_mul_hi_u32 v10, v15, v3 +; CGP-NEXT: v_mul_hi_u32 v3, v14, v3 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v11 ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v10 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v10, v17, v4 +; CGP-NEXT: v_mul_lo_u32 v10, v14, v4 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v11, v5 -; CGP-NEXT: v_mul_hi_u32 v11, v14, v4 +; CGP-NEXT: v_mul_hi_u32 v11, v15, v4 ; CGP-NEXT: v_add_i32_e32 v3, vcc, v10, v3 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v11 ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v11 -; CGP-NEXT: v_mul_hi_u32 v4, v17, v4 +; CGP-NEXT: v_mul_hi_u32 v4, v14, v4 ; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v5 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v5 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v14, v3 -; CGP-NEXT: v_addc_u32_e32 v4, vcc, v17, v4, vcc +; CGP-NEXT: v_add_i32_e32 v3, vcc, v15, v3 +; CGP-NEXT: v_addc_u32_e32 v4, vcc, v14, v4, vcc ; CGP-NEXT: v_mul_lo_u32 v5, v13, v3 ; CGP-NEXT: v_mul_lo_u32 v10, v12, v4 ; CGP-NEXT: v_mul_hi_u32 v11, v12, v3 @@ -751,12 +743,12 @@ define <2 x i64> @v_sdiv_v2i64(<2 x 
i64> %num, <2 x i64> %den) { ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 ; CGP-NEXT: v_add_i32_e32 v14, vcc, v3, v5 -; CGP-NEXT: v_mul_hi_u32 v11, v13, v4 +; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v3 +; CGP-NEXT: v_mul_hi_u32 v10, v13, v4 ; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v2, v14, 0 -; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v5 -; CGP-NEXT: v_add_i32_e32 v16, vcc, v11, v5 -; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v2, v16, v[4:5] +; CGP-NEXT: v_add_i32_e32 v15, vcc, v10, v5 +; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v2, v15, v[4:5] ; CGP-NEXT: v_sub_i32_e32 v3, vcc, v12, v3 ; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, v14, v[10:11] ; CGP-NEXT: v_subb_u32_e64 v5, s[4:5], v13, v4, vcc @@ -771,7 +763,7 @@ define <2 x i64> @v_sdiv_v2i64(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: v_subbrev_u32_e32 v4, vcc, 0, v4, vcc ; CGP-NEXT: v_cndmask_b32_e64 v5, v10, v11, s[4:5] ; CGP-NEXT: v_add_i32_e32 v10, vcc, 1, v14 -; CGP-NEXT: v_addc_u32_e32 v11, vcc, 0, v16, vcc +; CGP-NEXT: v_addc_u32_e32 v11, vcc, 0, v15, vcc ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v4, v1 ; CGP-NEXT: v_cndmask_b32_e64 v12, 0, -1, vcc ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v3, v2 @@ -785,8 +777,8 @@ define <2 x i64> @v_sdiv_v2i64(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: v_cndmask_b32_e32 v2, v11, v3, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 ; CGP-NEXT: v_cndmask_b32_e32 v1, v14, v1, vcc -; CGP-NEXT: v_xor_b32_e32 v3, v15, v0 -; CGP-NEXT: v_cndmask_b32_e32 v2, v16, v2, vcc +; CGP-NEXT: v_xor_b32_e32 v3, v16, v0 +; CGP-NEXT: v_cndmask_b32_e32 v2, v15, v2, vcc ; CGP-NEXT: v_xor_b32_e32 v0, v1, v3 ; CGP-NEXT: v_xor_b32_e32 v1, v2, v3 ; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v3 @@ -840,28 +832,28 @@ define <2 x i64> @v_sdiv_v2i64(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: v_xor_b32_e32 v3, v5, v2 ; CGP-NEXT: v_cvt_f32_u32_e32 v5, v4 ; CGP-NEXT: v_cvt_f32_u32_e32 v6, v3 -; CGP-NEXT: v_sub_i32_e32 v13, vcc, 0, v4 -; CGP-NEXT: v_subb_u32_e32 v14, vcc, 0, v3, vcc +; CGP-NEXT: v_sub_i32_e32 v14, vcc, 0, v4 +; CGP-NEXT: v_subb_u32_e32 v15, vcc, 0, v3, vcc ; CGP-NEXT: v_mac_f32_e32 v5, 0x4f800000, v6 ; CGP-NEXT: v_rcp_iflag_f32_e32 v5, v5 ; CGP-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5 ; CGP-NEXT: v_mul_f32_e32 v6, 0x2f800000, v5 -; CGP-NEXT: v_trunc_f32_e32 v7, v6 -; CGP-NEXT: v_mac_f32_e32 v5, 0xcf800000, v7 -; CGP-NEXT: v_cvt_u32_f32_e32 v12, v5 -; CGP-NEXT: v_cvt_u32_f32_e32 v15, v7 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v13, v12, 0 -; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v13, v15, v[6:7] -; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v14, v12, v[10:11] -; CGP-NEXT: v_mul_lo_u32 v7, v15, v5 -; CGP-NEXT: v_mul_hi_u32 v10, v12, v5 -; CGP-NEXT: v_mul_lo_u32 v11, v12, v6 -; CGP-NEXT: v_mul_hi_u32 v5, v15, v5 -; CGP-NEXT: v_mul_lo_u32 v16, v15, v6 +; CGP-NEXT: v_trunc_f32_e32 v6, v6 +; CGP-NEXT: v_mac_f32_e32 v5, 0xcf800000, v6 +; CGP-NEXT: v_cvt_u32_f32_e32 v13, v5 +; CGP-NEXT: v_cvt_u32_f32_e32 v12, v6 +; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v14, v13, 0 +; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v12, v[6:7] +; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v15, v13, v[10:11] +; CGP-NEXT: v_mul_lo_u32 v7, v12, v5 +; CGP-NEXT: v_mul_hi_u32 v10, v13, v5 +; CGP-NEXT: v_mul_lo_u32 v11, v13, v6 +; CGP-NEXT: v_mul_hi_u32 v5, v12, v5 +; CGP-NEXT: v_mul_lo_u32 v16, v12, v6 ; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v11 ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v10 
-; CGP-NEXT: v_mul_hi_u32 v10, v12, v6 +; CGP-NEXT: v_mul_hi_u32 v10, v13, v6 ; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v7, vcc, v11, v7 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v16, v5 @@ -869,53 +861,53 @@ define <2 x i64> @v_sdiv_v2i64(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v10 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; CGP-NEXT: v_mul_hi_u32 v6, v15, v6 +; CGP-NEXT: v_mul_hi_u32 v6, v12, v6 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v7 ; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v7, vcc, v10, v7 ; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v7 -; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v5 -; CGP-NEXT: v_addc_u32_e32 v15, vcc, v15, v6, vcc -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v13, v12, 0 -; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v13, v15, v[6:7] -; CGP-NEXT: v_ashrrev_i32_e32 v13, 31, v9 -; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v14, v12, v[10:11] -; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v13 -; CGP-NEXT: v_addc_u32_e32 v8, vcc, v9, v13, vcc -; CGP-NEXT: v_xor_b32_e32 v11, v7, v13 -; CGP-NEXT: v_mul_lo_u32 v7, v15, v5 -; CGP-NEXT: v_mul_lo_u32 v9, v12, v6 -; CGP-NEXT: v_xor_b32_e32 v14, v8, v13 -; CGP-NEXT: v_mul_hi_u32 v8, v12, v5 -; CGP-NEXT: v_mul_hi_u32 v5, v15, v5 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v5 +; CGP-NEXT: v_addc_u32_e32 v12, vcc, v12, v6, vcc +; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v14, v13, 0 +; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v12, v[6:7] +; CGP-NEXT: v_ashrrev_i32_e32 v14, 31, v9 +; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v15, v13, v[10:11] +; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v14 +; CGP-NEXT: v_addc_u32_e32 v8, vcc, v9, v14, vcc +; CGP-NEXT: v_xor_b32_e32 v11, v7, v14 +; CGP-NEXT: v_mul_lo_u32 v7, v12, v5 +; CGP-NEXT: v_mul_lo_u32 v9, v13, v6 +; CGP-NEXT: v_xor_b32_e32 v15, v8, v14 +; CGP-NEXT: v_mul_hi_u32 v8, v13, v5 +; CGP-NEXT: v_mul_hi_u32 v5, v12, v5 ; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v9 ; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v8 ; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v8, v15, v6 +; CGP-NEXT: v_mul_lo_u32 v8, v12, v6 ; CGP-NEXT: v_add_i32_e32 v7, vcc, v9, v7 -; CGP-NEXT: v_mul_hi_u32 v9, v12, v6 +; CGP-NEXT: v_mul_hi_u32 v9, v13, v6 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v8, v5 ; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v9 ; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v9 -; CGP-NEXT: v_mul_hi_u32 v6, v15, v6 +; CGP-NEXT: v_mul_hi_u32 v6, v12, v6 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v7 ; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v7 ; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v7 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v12, v5 -; CGP-NEXT: v_addc_u32_e32 v6, vcc, v15, v6, vcc -; CGP-NEXT: v_mul_lo_u32 v7, v14, v5 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v13, v5 +; CGP-NEXT: v_addc_u32_e32 v6, vcc, v12, v6, vcc +; CGP-NEXT: v_mul_lo_u32 v7, v15, v5 ; CGP-NEXT: v_mul_lo_u32 v8, v11, v6 ; CGP-NEXT: v_mul_hi_u32 v9, v11, v5 -; CGP-NEXT: v_mul_hi_u32 v5, v14, v5 +; CGP-NEXT: v_mul_hi_u32 v5, v15, v5 ; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v8 ; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v9 ; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v9, v14, v6 +; CGP-NEXT: v_mul_lo_u32 v9, v15, v6 ; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v7 ; CGP-NEXT: v_mul_hi_u32 v8, v11, v6 ; CGP-NEXT: 
v_add_i32_e32 v5, vcc, v9, v5 @@ -924,16 +916,16 @@ define <2 x i64> @v_sdiv_v2i64(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v8, vcc, v9, v8 ; CGP-NEXT: v_add_i32_e32 v12, vcc, v5, v7 -; CGP-NEXT: v_mul_hi_u32 v9, v14, v6 +; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v5 +; CGP-NEXT: v_mul_hi_u32 v8, v15, v6 ; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v12, 0 -; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v7 -; CGP-NEXT: v_add_i32_e32 v15, vcc, v9, v7 -; CGP-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v4, v15, v[6:7] +; CGP-NEXT: v_add_i32_e32 v13, vcc, v8, v7 +; CGP-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v4, v13, v[6:7] ; CGP-NEXT: v_sub_i32_e32 v5, vcc, v11, v5 ; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v3, v12, v[7:8] -; CGP-NEXT: v_subb_u32_e64 v6, s[4:5], v14, v9, vcc -; CGP-NEXT: v_sub_i32_e64 v7, s[4:5], v14, v9 +; CGP-NEXT: v_subb_u32_e64 v6, s[4:5], v15, v9, vcc +; CGP-NEXT: v_sub_i32_e64 v7, s[4:5], v15, v9 ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v3 ; CGP-NEXT: v_subb_u32_e32 v7, vcc, v7, v3, vcc ; CGP-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] @@ -944,7 +936,7 @@ define <2 x i64> @v_sdiv_v2i64(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: v_subbrev_u32_e32 v7, vcc, 0, v7, vcc ; CGP-NEXT: v_cndmask_b32_e64 v6, v8, v9, s[4:5] ; CGP-NEXT: v_add_i32_e32 v8, vcc, 1, v12 -; CGP-NEXT: v_addc_u32_e32 v9, vcc, 0, v15, vcc +; CGP-NEXT: v_addc_u32_e32 v9, vcc, 0, v13, vcc ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v7, v3 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, -1, vcc ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v5, v4 @@ -958,8 +950,8 @@ define <2 x i64> @v_sdiv_v2i64(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: v_cndmask_b32_e32 v4, v9, v5, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 ; CGP-NEXT: v_cndmask_b32_e32 v3, v12, v3, vcc -; CGP-NEXT: v_xor_b32_e32 v5, v13, v2 -; CGP-NEXT: v_cndmask_b32_e32 v4, v15, v4, vcc +; CGP-NEXT: v_xor_b32_e32 v5, v14, v2 +; CGP-NEXT: v_cndmask_b32_e32 v4, v13, v4, vcc ; CGP-NEXT: v_xor_b32_e32 v2, v3, v5 ; CGP-NEXT: v_xor_b32_e32 v3, v4, v5 ; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v5 @@ -1054,10 +1046,10 @@ define i64 @v_sdiv_i64_oddk_denom(i64 %num) { ; CHECK-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; CHECK-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 ; CHECK-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2 -; CHECK-NEXT: v_trunc_f32_e32 v4, v3 -; CHECK-NEXT: v_mac_f32_e32 v2, 0xcf800000, v4 +; CHECK-NEXT: v_trunc_f32_e32 v3, v3 +; CHECK-NEXT: v_mac_f32_e32 v2, 0xcf800000, v3 ; CHECK-NEXT: v_cvt_u32_f32_e32 v8, v2 -; CHECK-NEXT: v_cvt_u32_f32_e32 v10, v4 +; CHECK-NEXT: v_cvt_u32_f32_e32 v10, v3 ; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v8, 0 ; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v9, v10, v[3:4] ; CHECK-NEXT: v_mul_lo_u32 v3, v10, v2 @@ -1133,11 +1125,11 @@ define i64 @v_sdiv_i64_oddk_denom(i64 %num) { ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v7, v3 ; CHECK-NEXT: v_add_i32_e32 v7, vcc, v0, v2 -; CHECK-NEXT: v_mul_hi_u32 v8, v5, v1 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v0 +; CHECK-NEXT: v_mul_hi_u32 v3, v5, v1 ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v7, 0 -; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v2 +; CHECK-NEXT: v_add_i32_e32 v8, vcc, v3, v2 ; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v8, v[1:2] ; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v4, v0 ; CHECK-NEXT: v_subb_u32_e64 
v1, s[4:5], v5, v2, vcc @@ -1186,155 +1178,152 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) { ; GISEL-NEXT: s_subb_u32 s6, 0, 0 ; GISEL-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 ; GISEL-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4 -; GISEL-NEXT: v_trunc_f32_e32 v7, v5 -; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v7 -; GISEL-NEXT: v_cvt_u32_f32_e32 v8, v4 -; GISEL-NEXT: v_cvt_u32_f32_e32 v9, v7 -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v8, 0 -; GISEL-NEXT: v_mov_b32_e32 v7, v5 -; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v6, v9, v[7:8] -; GISEL-NEXT: v_mul_hi_u32 v12, v9, v4 -; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], s6, v8, v[10:11] -; GISEL-NEXT: v_mul_lo_u32 v10, v9, v4 -; GISEL-NEXT: v_mul_hi_u32 v11, v8, v4 -; GISEL-NEXT: v_mul_lo_u32 v7, v8, v13 -; GISEL-NEXT: v_mul_lo_u32 v4, v9, v13 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v10, v7 +; GISEL-NEXT: v_trunc_f32_e32 v5, v5 +; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v5 +; GISEL-NEXT: v_cvt_u32_f32_e32 v7, v4 +; GISEL-NEXT: v_cvt_u32_f32_e32 v8, v5 +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v7, 0 +; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v6, v8, v[5:6] +; GISEL-NEXT: v_mul_hi_u32 v11, v7, v4 +; GISEL-NEXT: v_mul_hi_u32 v12, v8, v4 +; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], s6, v7, v[9:10] +; GISEL-NEXT: v_mul_lo_u32 v10, v8, v4 +; GISEL-NEXT: v_mul_lo_u32 v9, v7, v13 +; GISEL-NEXT: v_mul_lo_u32 v4, v8, v13 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v14, v7 -; GISEL-NEXT: v_mul_hi_u32 v14, v8, v13 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v14, v9 +; GISEL-NEXT: v_mul_hi_u32 v14, v7, v13 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v14 -; GISEL-NEXT: v_mul_hi_u32 v13, v9, v13 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v14, v7 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v13, v7 -; GISEL-NEXT: v_add_i32_e32 v18, vcc, v8, v4 -; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v18, 0 -; GISEL-NEXT: v_addc_u32_e32 v7, vcc, v9, v7, vcc -; GISEL-NEXT: v_mov_b32_e32 v4, v14 -; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v7, v[4:5] -; GISEL-NEXT: v_mul_lo_u32 v4, v7, v13 -; GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], s6, v18, v[14:15] +; GISEL-NEXT: v_mul_hi_u32 v13, v8, v13 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v14, v9 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v13, v9 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v7, v4 +; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v4, 0 +; GISEL-NEXT: v_addc_u32_e32 v17, vcc, v8, v9, vcc +; GISEL-NEXT: v_mad_u64_u32 v[15:16], s[4:5], v6, v17, v[14:15] +; GISEL-NEXT: v_mul_lo_u32 v9, v17, v13 +; GISEL-NEXT: v_mul_hi_u32 v18, v4, v13 +; GISEL-NEXT: v_mul_hi_u32 v19, v17, v13 +; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], s6, v4, v[15:16] ; GISEL-NEXT: s_mov_b32 s6, 1 ; GISEL-NEXT: s_cmp_lg_u32 s6, 0 -; GISEL-NEXT: v_mul_lo_u32 v14, v18, v16 +; GISEL-NEXT: v_mul_lo_u32 v14, v4, v13 +; GISEL-NEXT: v_mul_hi_u32 v15, v4, v13 ; GISEL-NEXT: s_subb_u32 s6, 0, 0 -; GISEL-NEXT: 
v_add_i32_e32 v4, vcc, v4, v14 -; GISEL-NEXT: v_mul_hi_u32 v14, v18, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; GISEL-NEXT: v_mul_hi_u32 v13, v7, v13 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v14 -; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v14, v7, v16 -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v4 -; GISEL-NEXT: v_mul_hi_u32 v4, v18, v16 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v4 -; GISEL-NEXT: v_ashrrev_i32_e32 v4, 31, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v4 -; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v4, vcc -; GISEL-NEXT: v_xor_b32_e32 v17, v0, v4 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v13, v15 -; GISEL-NEXT: v_mul_hi_u32 v13, v7, v16 -; GISEL-NEXT: v_xor_b32_e32 v19, v1, v4 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v18 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v9 +; GISEL-NEXT: v_mul_lo_u32 v9, v17, v13 +; GISEL-NEXT: v_mul_hi_u32 v13, v17, v13 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v19 +; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v9, v15 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v16, vcc, v16, v9 +; GISEL-NEXT: v_ashrrev_i32_e32 v9, 31, v1 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v9 +; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v9, vcc +; GISEL-NEXT: v_xor_b32_e32 v18, v0, v9 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v15, v14 +; GISEL-NEXT: v_xor_b32_e32 v19, v1, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v14, v1 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v16, v1 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v13, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v18, v0 -; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v7, v1, vcc +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v4, v0 +; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v17, v1, vcc ; GISEL-NEXT: v_mul_lo_u32 v13, v19, v0 -; GISEL-NEXT: v_mul_lo_u32 v14, v17, v1 -; GISEL-NEXT: v_mul_hi_u32 v15, v17, v0 +; GISEL-NEXT: v_mul_lo_u32 v14, v18, v1 +; GISEL-NEXT: v_mul_hi_u32 v15, v18, v0 ; GISEL-NEXT: v_mul_hi_u32 v0, v19, v0 -; GISEL-NEXT: v_mov_b32_e32 v7, 0x12d8fb +; GISEL-NEXT: v_mov_b32_e32 v4, 0x12d8fb ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v15 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; GISEL-NEXT: v_mul_lo_u32 v15, v19, v1 ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; GISEL-NEXT: v_mul_hi_u32 v14, v17, v1 +; GISEL-NEXT: v_mul_hi_u32 v14, v18, v1 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v15, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v14 -; GISEL-NEXT: v_add_i32_e32 v18, vcc, v0, v13 -; GISEL-NEXT: v_mul_hi_u32 v15, v19, v1 -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v7, v18, 0 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; GISEL-NEXT: v_add_i32_e32 v20, vcc, v15, v13 -; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v7, v20, v[1:2] -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v17, v0 -; GISEL-NEXT: v_mad_u64_u32 v[15:16], s[4:5], 0, v18, v[13:14] +; GISEL-NEXT: v_mul_hi_u32 v1, v19, v1 +; GISEL-NEXT: v_add_i32_e32 v17, vcc, v0, v13 +; 
GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v14, v0
+; GISEL-NEXT: v_add_i32_e32 v20, vcc, v1, v0
+; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v17, 0
+; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v20, v[1:2]
+; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v18, v0
+; GISEL-NEXT: v_mad_u64_u32 v[15:16], s[4:5], 0, v17, v[13:14]
; GISEL-NEXT: v_subb_u32_e64 v1, s[4:5], v19, v15, vcc
; GISEL-NEXT: v_sub_i32_e64 v13, s[4:5], v19, v15
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v7
+; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v4
; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[4:5]
; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v1
; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v13, vcc
-; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v7
+; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v4
; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
-; GISEL-NEXT: v_add_i32_e32 v16, vcc, 1, v18
-; GISEL-NEXT: v_addc_u32_e32 v17, vcc, 0, v20, vcc
-; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v7
+; GISEL-NEXT: v_add_i32_e32 v16, vcc, 1, v17
+; GISEL-NEXT: v_addc_u32_e32 v18, vcc, 0, v20, vcc
+; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v4
; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
-; GISEL-NEXT: v_cndmask_b32_e32 v19, -1, v0, vcc
-; GISEL-NEXT: v_mov_b32_e32 v0, v5
; GISEL-NEXT: v_cndmask_b32_e64 v15, -1, v14, s[4:5]
-; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v9, v[0:1]
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v8, v[13:14]
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, 1, v16
-; GISEL-NEXT: v_mul_lo_u32 v13, v8, v0
-; GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v17, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v19, -1, v0, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v8, v[5:6]
+; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], s6, v7, v[0:1]
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, 1, v16
+; GISEL-NEXT: v_mul_lo_u32 v5, v7, v13
+; GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v18, vcc
; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19
-; GISEL-NEXT: v_cndmask_b32_e32 v14, v16, v1, vcc
-; GISEL-NEXT: v_cndmask_b32_e32 v5, v17, v5, vcc
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v13
+; GISEL-NEXT: v_cndmask_b32_e32 v14, v16, v0, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v16, v18, v1, vcc
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v5
+; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v11
+; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GISEL-NEXT: v_mul_lo_u32 v5, v8, v13
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v1, v0
+; GISEL-NEXT: v_mul_hi_u32 v1, v7, v13
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v12
; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v11
+; GISEL-NEXT: v_add_i32_e32 v1, vcc, v5, v1
+; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v10, v5
+; GISEL-NEXT: v_mul_hi_u32 v10, v8, v13
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v1, v0
; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v11, v9, v0
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1
-; GISEL-NEXT: v_mul_hi_u32 v10, v8, v0
-; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12
-; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10
-; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11
-; GISEL-NEXT: v_mul_hi_u32 v0, v9, v0
+; GISEL-NEXT: v_add_i32_e32 v1, vcc, v5, v1
; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1
-; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v10
-; GISEL-NEXT: v_add_i32_e32 v10, vcc, v8, v1
-; GISEL-NEXT: v_addc_u32_e32 v11, vcc, v9, v0, vcc
+; GISEL-NEXT: v_add_i32_e32 v10, vcc, v7, v0
+; GISEL-NEXT: v_addc_u32_e32 v11, vcc, v8, v1, vcc
; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v10, 0
; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v15
-; GISEL-NEXT: v_cndmask_b32_e32 v13, v20, v5, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v6, v11, v[1:2]
-; GISEL-NEXT: v_cndmask_b32_e32 v12, v18, v14, vcc
-; GISEL-NEXT: v_xor_b32_e32 v1, v12, v4
-; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], s6, v10, v[8:9]
-; GISEL-NEXT: v_ashrrev_i32_e32 v12, 31, v3
-; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v12
-; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v12, vcc
-; GISEL-NEXT: v_xor_b32_e32 v8, v2, v12
+; GISEL-NEXT: v_cndmask_b32_e32 v5, v17, v14, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v6, v11, v[1:2]
+; GISEL-NEXT: v_xor_b32_e32 v1, v5, v9
+; GISEL-NEXT: v_ashrrev_i32_e32 v13, 31, v3
+; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], s6, v10, v[7:8]
+; GISEL-NEXT: v_cndmask_b32_e32 v12, v20, v16, vcc
+; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v13
+; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v13, vcc
+; GISEL-NEXT: v_xor_b32_e32 v14, v2, v13
; GISEL-NEXT: v_mul_lo_u32 v2, v11, v0
; GISEL-NEXT: v_mul_lo_u32 v6, v10, v5
-; GISEL-NEXT: v_xor_b32_e32 v9, v3, v12
+; GISEL-NEXT: v_xor_b32_e32 v15, v3, v13
; GISEL-NEXT: v_mul_hi_u32 v3, v10, v0
; GISEL-NEXT: v_mul_hi_u32 v0, v11, v0
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v6
@@ -1356,62 +1345,61 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) {
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v5, v2
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v0
; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v11, v2, vcc
-; GISEL-NEXT: v_mul_lo_u32 v3, v9, v0
-; GISEL-NEXT: v_mul_lo_u32 v5, v8, v2
-; GISEL-NEXT: v_mul_hi_u32 v6, v8, v0
-; GISEL-NEXT: v_mul_hi_u32 v0, v9, v0
-; GISEL-NEXT: v_xor_b32_e32 v10, v13, v4
+; GISEL-NEXT: v_mul_lo_u32 v3, v15, v0
+; GISEL-NEXT: v_mul_lo_u32 v5, v14, v2
+; GISEL-NEXT: v_mul_hi_u32 v6, v14, v0
+; GISEL-NEXT: v_mul_hi_u32 v0, v15, v0
+; GISEL-NEXT: v_xor_b32_e32 v7, v12, v9
; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v5
; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v6
; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v6, v9, v2
+; GISEL-NEXT: v_mul_lo_u32 v6, v15, v2
; GISEL-NEXT: v_add_i32_e32 v3, vcc, v5, v3
-; GISEL-NEXT: v_mul_hi_u32 v5, v8, v2
+; GISEL-NEXT: v_mul_hi_u32 v5, v14, v2
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v6, v0
; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v5
; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v5
-; GISEL-NEXT: v_add_i32_e32 v11, vcc, v0, v3
-; GISEL-NEXT: v_mul_hi_u32 v6, v9, v2
-; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v7, v11, 0
+; GISEL-NEXT: v_add_i32_e32 v10, vcc, v0, v3
; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0
-; GISEL-NEXT: v_add_i32_e32 v13, vcc, v6, v0
-; GISEL-NEXT: v_mov_b32_e32 v0, v3
-; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v7, v13, v[0:1]
-; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v1, v4
-; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v10, v4, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], 0, v11, v[5:6]
-; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v8, v2
-; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], v9, v3, vcc
-; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v9, v3
-; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v7
-; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v7
-; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[4:5]
-; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v4
-; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc
-; GISEL-NEXT: v_cndmask_b32_e64 v4, -1, v5, s[4:5]
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, 1, v11
-; GISEL-NEXT: v_addc_u32_e32 v6, vcc, 0, v13, vcc
-; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v2, v7
+; GISEL-NEXT: v_mul_hi_u32 v5, v15, v2
+; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v10, 0
+; GISEL-NEXT: v_add_i32_e32 v11, vcc, v5, v0
+; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v11, v[3:4]
+; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v1, v9
+; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v7, v9, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], 0, v10, v[5:6]
+; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v14, v2
+; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], v15, v7
+; GISEL-NEXT: v_subb_u32_e64 v3, s[4:5], v15, v7, vcc
+; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc
+; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v4
+; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v4
+; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5]
+; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v3
+; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc
+; GISEL-NEXT: v_cndmask_b32_e64 v3, -1, v6, s[4:5]
+; GISEL-NEXT: v_add_i32_e32 v6, vcc, 1, v10
+; GISEL-NEXT: v_addc_u32_e32 v7, vcc, 0, v11, vcc
+; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v2, v4
; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc
-; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
+; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5
; GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc
-; GISEL-NEXT: v_add_i32_e32 v3, vcc, 1, v5
-; GISEL-NEXT: v_addc_u32_e32 v7, vcc, 0, v6, vcc
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, 1, v6
+; GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc
; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
-; GISEL-NEXT: v_cndmask_b32_e32 v2, v5, v3, vcc
-; GISEL-NEXT: v_cndmask_b32_e32 v3, v6, v7, vcc
-; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
-; GISEL-NEXT: v_cndmask_b32_e32 v2, v11, v2, vcc
-; GISEL-NEXT: v_cndmask_b32_e32 v3, v13, v3, vcc
-; GISEL-NEXT: v_xor_b32_e32 v2, v2, v12
-; GISEL-NEXT: v_xor_b32_e32 v3, v3, v12
-; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v12
-; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v12, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v2, v6, v4, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v4, v7, v5, vcc
+; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3
+; GISEL-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v3, v11, v4, vcc
+; GISEL-NEXT: v_xor_b32_e32 v2, v2, v13
+; GISEL-NEXT: v_xor_b32_e32 v3, v3, v13
+; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v13
+; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v13, vcc
; GISEL-NEXT: s_setpc_b64 s[30:31]
;
; CGP-LABEL: v_sdiv_v2i64_oddk_denom:
@@ -1424,178 +1412,175 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) {
; CGP-NEXT: v_rcp_iflag_f32_e32 v4, v4
; CGP-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4
; CGP-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4
-; CGP-NEXT: v_trunc_f32_e32 v8, v5
-; CGP-NEXT: v_mac_f32_e32 v4, 0xcf800000, v8
+; CGP-NEXT: v_trunc_f32_e32 v5, v5
+; CGP-NEXT: v_mac_f32_e32 v4, 0xcf800000, v5
; CGP-NEXT: v_cvt_u32_f32_e32 v7, v4
-; CGP-NEXT: v_cvt_u32_f32_e32 v8, v8
+; CGP-NEXT: v_cvt_u32_f32_e32 v8, v5
; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v7, 0
-; CGP-NEXT: v_mov_b32_e32 v9, v5
-; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v6, v8, v[9:10]
-; CGP-NEXT: v_mul_hi_u32 v12, v8, v4
-; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], -1, v7, v[10:11]
-; CGP-NEXT: v_mul_lo_u32 v10, v8, v4
-; CGP-NEXT: v_mul_hi_u32 v11, v7, v4
-; CGP-NEXT: v_mul_lo_u32 v4, v7, v13
-; CGP-NEXT: v_mul_lo_u32 v9, v8, v13
-; CGP-NEXT: v_mul_hi_u32 v14, v7, v13
-; CGP-NEXT: v_mul_hi_u32 v13, v8, v13
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v10, v4
+; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v6, v8, v[5:6]
+; CGP-NEXT: v_mul_hi_u32 v11, v8, v4
+; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], -1, v7, v[9:10]
+; CGP-NEXT: v_mul_lo_u32 v9, v8, v4
+; CGP-NEXT: v_mul_hi_u32 v10, v7, v4
+; CGP-NEXT: v_mul_lo_u32 v4, v7, v12
+; CGP-NEXT: v_mul_lo_u32 v13, v8, v12
+; CGP-NEXT: v_mul_hi_u32 v14, v7, v12
+; CGP-NEXT: v_mul_hi_u32 v12, v8, v12
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v9, v4
; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v11
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v10
; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v4, vcc, v15, v4
-; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v12
+; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v11
; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v14
+; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v14
; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v9, v4
-; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v9, vcc, v14, v9
-; CGP-NEXT: v_add_i32_e32 v9, vcc, v13, v9
-; CGP-NEXT: v_add_i32_e32 v18, vcc, v7, v4
-; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v18, 0
-; CGP-NEXT: v_addc_u32_e32 v19, vcc, v8, v9, vcc
-; CGP-NEXT: v_mov_b32_e32 v4, v14
-; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v19, v[4:5]
-; CGP-NEXT: v_mul_lo_u32 v4, v19, v13
-; CGP-NEXT: v_mad_u64_u32 v[16:17], s[4:5], -1, v18, v[14:15]
-; CGP-NEXT: v_mul_lo_u32 v9, v18, v16
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9
-; CGP-NEXT: v_mul_hi_u32 v9, v18, v13
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v13, v4
+; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13
+; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v13
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v7, v4
+; CGP-NEXT: v_addc_u32_e32 v16, vcc, v8, v12, vcc
+; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v6, v4, 0
+; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v16, v[13:14]
+; CGP-NEXT: v_mul_lo_u32 v17, v16, v12
+; CGP-NEXT: v_mul_hi_u32 v18, v4, v12
+; CGP-NEXT: v_mul_hi_u32 v19, v16, v12
+; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], -1, v4, v[14:15]
+; CGP-NEXT: v_mul_lo_u32 v13, v4, v12
+; CGP-NEXT: v_mul_hi_u32 v15, v4, v12
+; CGP-NEXT: v_add_i32_e32 v13, vcc, v17, v13
; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9
-; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; CGP-NEXT: v_mul_hi_u32 v9, v19, v13
-; CGP-NEXT: v_mul_lo_u32 v13, v19, v16
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v14, v4
-; CGP-NEXT: v_mul_hi_u32 v14, v18, v16
-; CGP-NEXT: v_add_i32_e32 v9, vcc, v13, v9
+; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v18
; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v14, vcc, v9, v14
-; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v9
-; CGP-NEXT: v_ashrrev_i32_e32 v9, 31, v1
-; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v9
-; CGP-NEXT: v_addc_u32_e32 v1, vcc, v1, v9, vcc
-; CGP-NEXT: v_xor_b32_e32 v15, v0, v9
-; CGP-NEXT: v_add_i32_e32 v0, vcc, v14, v4
-; CGP-NEXT: v_mul_hi_u32 v4, v19, v16
-; CGP-NEXT: v_xor_b32_e32 v17, v1, v9
+; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13
+; CGP-NEXT: v_mul_lo_u32 v14, v16, v12
+; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v19
+; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v15
+; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v15, vcc, v17, v15
+; CGP-NEXT: v_mul_hi_u32 v17, v16, v12
+; CGP-NEXT: v_ashrrev_i32_e32 v12, 31, v1
+; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v12
+; CGP-NEXT: v_addc_u32_e32 v1, vcc, v1, v12, vcc
+; CGP-NEXT: v_xor_b32_e32 v18, v0, v12
+; CGP-NEXT: v_add_i32_e32 v0, vcc, v14, v13
+; CGP-NEXT: v_xor_b32_e32 v19, v1, v12
; CGP-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v1, vcc, v13, v1
-; CGP-NEXT: v_add_i32_e32 v1, vcc, v4, v1
-; CGP-NEXT: v_add_i32_e32 v0, vcc, v18, v0
-; CGP-NEXT: v_addc_u32_e32 v1, vcc, v19, v1, vcc
-; CGP-NEXT: v_mul_lo_u32 v13, v17, v0
-; CGP-NEXT: v_mul_lo_u32 v14, v15, v1
-; CGP-NEXT: v_mul_hi_u32 v16, v15, v0
-; CGP-NEXT: v_mul_hi_u32 v0, v17, v0
+; CGP-NEXT: v_add_i32_e32 v1, vcc, v15, v1
+; CGP-NEXT: v_add_i32_e32 v1, vcc, v17, v1
+; CGP-NEXT: v_add_i32_e32 v0, vcc, v4, v0
+; CGP-NEXT: v_addc_u32_e32 v1, vcc, v16, v1, vcc
+; CGP-NEXT: v_mul_lo_u32 v13, v19, v0
+; CGP-NEXT: v_mul_lo_u32 v14, v18, v1
+; CGP-NEXT: v_mul_hi_u32 v15, v18, v0
+; CGP-NEXT: v_mul_hi_u32 v0, v19, v0
; CGP-NEXT: v_mov_b32_e32 v4, 0x12d8fb
; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v14
; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v16
+; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v15
; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; CGP-NEXT: v_mul_lo_u32 v16, v17, v1
+; CGP-NEXT: v_mul_lo_u32 v15, v19, v1
; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13
-; CGP-NEXT: v_mul_hi_u32 v14, v15, v1
-; CGP-NEXT: v_add_i32_e32 v0, vcc, v16, v0
-; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc
+; CGP-NEXT: v_mul_hi_u32 v14, v18, v1
+; CGP-NEXT: v_add_i32_e32 v0, vcc, v15, v0
+; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v14
; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v14, vcc, v16, v14
-; CGP-NEXT: v_add_i32_e32 v16, vcc, v0, v13
-; CGP-NEXT: v_mul_hi_u32 v18, v17, v1
-; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v16, 0
-; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13
-; CGP-NEXT: v_add_i32_e32 v18, vcc, v18, v13
-; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v18, v[1:2]
-; CGP-NEXT: v_sub_i32_e32 v0, vcc, v15, v0
-; CGP-NEXT: v_subb_u32_e64 v1, s[4:5], v17, v13, vcc
-; CGP-NEXT: v_sub_i32_e64 v13, s[4:5], v17, v13
+; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14
+; CGP-NEXT: v_mul_hi_u32 v1, v19, v1
+; CGP-NEXT: v_add_i32_e32 v15, vcc, v0, v13
+; CGP-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v0, vcc, v14, v0
+; CGP-NEXT: v_add_i32_e32 v16, vcc, v1, v0
+; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v15, 0
+; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v16, v[1:2]
+; CGP-NEXT: v_sub_i32_e32 v0, vcc, v18, v0
+; CGP-NEXT: v_subb_u32_e64 v1, s[4:5], v19, v13, vcc
+; CGP-NEXT: v_sub_i32_e64 v13, s[4:5], v19, v13
; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v4
; CGP-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[4:5]
; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v1
; CGP-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v13, vcc
; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v4
; CGP-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
-; CGP-NEXT: v_add_i32_e32 v17, vcc, 1, v16
-; CGP-NEXT: v_addc_u32_e32 v19, vcc, 0, v18, vcc
+; CGP-NEXT: v_add_i32_e32 v18, vcc, 1, v15
+; CGP-NEXT: v_addc_u32_e32 v19, vcc, 0, v16, vcc
; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v4
-; CGP-NEXT: v_mov_b32_e32 v0, v5
-; CGP-NEXT: v_cndmask_b32_e64 v15, -1, v14, s[4:5]
-; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v8, v[0:1]
-; CGP-NEXT: v_cndmask_b32_e64 v20, 0, -1, vcc
+; CGP-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
-; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], -1, v7, v[13:14]
-; CGP-NEXT: v_cndmask_b32_e32 v5, -1, v20, vcc
-; CGP-NEXT: v_add_i32_e32 v1, vcc, 1, v17
-; CGP-NEXT: v_addc_u32_e32 v13, vcc, 0, v19, vcc
-; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5
-; CGP-NEXT: v_mul_lo_u32 v5, v7, v0
-; CGP-NEXT: v_cndmask_b32_e32 v14, v17, v1, vcc
-; CGP-NEXT: v_cndmask_b32_e32 v13, v19, v13, vcc
-; CGP-NEXT: v_add_i32_e32 v1, vcc, v10, v5
-; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v11
-; CGP-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; CGP-NEXT: v_mul_lo_u32 v10, v8, v0
-; CGP-NEXT: v_add_i32_e32 v1, vcc, v5, v1
-; CGP-NEXT: v_mul_hi_u32 v5, v7, v0
-; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v12
-; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v5
-; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10
-; CGP-NEXT: v_mul_hi_u32 v0, v8, v0
-; CGP-NEXT: v_add_i32_e32 v1, vcc, v5, v1
-; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v5
-; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v5
-; CGP-NEXT: v_add_i32_e32 v10, vcc, v7, v1
-; CGP-NEXT: v_addc_u32_e32 v11, vcc, v8, v0, vcc
-; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v10, 0
-; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v15
-; CGP-NEXT: v_cndmask_b32_e32 v5, v16, v14, vcc
-; CGP-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v6, v11, v[1:2]
-; CGP-NEXT: v_cndmask_b32_e32 v12, v18, v13, vcc
-; CGP-NEXT: v_xor_b32_e32 v13, v5, v9
-; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], -1, v10, v[7:8]
-; CGP-NEXT: v_xor_b32_e32 v1, v12, v9
-; CGP-NEXT: v_ashrrev_i32_e32 v12, 31, v3
-; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v12
-; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v12, vcc
-; CGP-NEXT: v_xor_b32_e32 v7, v2, v12
-; CGP-NEXT: v_mul_lo_u32 v2, v11, v0
-; CGP-NEXT: v_mul_lo_u32 v6, v10, v5
-; CGP-NEXT: v_xor_b32_e32 v8, v3, v12
-; CGP-NEXT: v_mul_hi_u32 v3, v10, v0
-; CGP-NEXT: v_mul_hi_u32 v0, v11, v0
+; CGP-NEXT: v_cndmask_b32_e64 v17, -1, v14, s[4:5]
+; CGP-NEXT: v_cndmask_b32_e32 v20, -1, v0, vcc
+; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v8, v[5:6]
+; CGP-NEXT: v_add_i32_e32 v5, vcc, 1, v18
+; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], -1, v7, v[0:1]
+; CGP-NEXT: v_addc_u32_e32 v21, vcc, 0, v19, vcc
+; CGP-NEXT: v_mul_lo_u32 v1, v7, v13
+; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20
+; CGP-NEXT: v_cndmask_b32_e32 v0, v18, v5, vcc
+; CGP-NEXT: v_cndmask_b32_e32 v5, v19, v21, vcc
+; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17
+; CGP-NEXT: v_cndmask_b32_e32 v14, v15, v0, vcc
+; CGP-NEXT: v_add_i32_e64 v0, s[4:5], v9, v1
+; CGP-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5]
+; CGP-NEXT: v_add_i32_e64 v0, s[4:5], v0, v10
+; CGP-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; CGP-NEXT: v_mul_lo_u32 v9, v8, v13
+; CGP-NEXT: v_add_i32_e64 v0, s[4:5], v1, v0
+; CGP-NEXT: v_mul_hi_u32 v1, v7, v13
+; CGP-NEXT: v_add_i32_e64 v9, s[4:5], v9, v11
+; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5]
+; CGP-NEXT: v_add_i32_e64 v1, s[4:5], v9, v1
+; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[4:5]
+; CGP-NEXT: v_add_i32_e64 v9, s[4:5], v10, v9
+; CGP-NEXT: v_mul_hi_u32 v10, v8, v13
+; CGP-NEXT: v_add_i32_e64 v0, s[4:5], v1, v0
+; CGP-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5]
+; CGP-NEXT: v_add_i32_e64 v1, s[4:5], v9, v1
+; CGP-NEXT: v_add_i32_e64 v1, s[4:5], v10, v1
+; CGP-NEXT: v_add_i32_e64 v9, s[4:5], v7, v0
+; CGP-NEXT: v_addc_u32_e64 v10, s[4:5], v8, v1, s[4:5]
+; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v9, 0
+; CGP-NEXT: v_cndmask_b32_e32 v5, v16, v5, vcc
+; CGP-NEXT: v_ashrrev_i32_e32 v13, 31, v3
+; CGP-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v6, v10, v[1:2]
+; CGP-NEXT: v_xor_b32_e32 v1, v5, v12
+; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v13
+; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], -1, v9, v[7:8]
+; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v13, vcc
+; CGP-NEXT: v_xor_b32_e32 v7, v2, v13
+; CGP-NEXT: v_mul_lo_u32 v2, v10, v0
+; CGP-NEXT: v_mul_lo_u32 v6, v9, v5
+; CGP-NEXT: v_xor_b32_e32 v8, v3, v13
+; CGP-NEXT: v_mul_hi_u32 v3, v9, v0
+; CGP-NEXT: v_mul_hi_u32 v0, v10, v0
; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v6
; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v3
; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; CGP-NEXT: v_mul_lo_u32 v3, v11, v5
+; CGP-NEXT: v_mul_lo_u32 v3, v10, v5
; CGP-NEXT: v_add_i32_e32 v2, vcc, v6, v2
-; CGP-NEXT: v_mul_hi_u32 v6, v10, v5
+; CGP-NEXT: v_mul_hi_u32 v6, v9, v5
; CGP-NEXT: v_add_i32_e32 v0, vcc, v3, v0
; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v6
; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v6
-; CGP-NEXT: v_mul_hi_u32 v5, v11, v5
+; CGP-NEXT: v_mul_hi_u32 v5, v10, v5
; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v2
; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v2, vcc, v3, v2
; CGP-NEXT: v_add_i32_e32 v2, vcc, v5, v2
-; CGP-NEXT: v_add_i32_e32 v3, vcc, v10, v0
-; CGP-NEXT: v_addc_u32_e32 v2, vcc, v11, v2, vcc
+; CGP-NEXT: v_add_i32_e32 v3, vcc, v9, v0
+; CGP-NEXT: v_addc_u32_e32 v2, vcc, v10, v2, vcc
; CGP-NEXT: v_mul_lo_u32 v5, v8, v3
; CGP-NEXT: v_mul_lo_u32 v6, v7, v2
-; CGP-NEXT: v_sub_i32_e32 v0, vcc, v13, v9
-; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v9, vcc
+; CGP-NEXT: v_xor_b32_e32 v11, v14, v12
; CGP-NEXT: v_mul_hi_u32 v9, v7, v3
+; CGP-NEXT: v_sub_i32_e32 v0, vcc, v11, v12
+; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v12, vcc
; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v6
; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v9
@@ -1610,11 +1595,11 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) {
; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v6, vcc, v9, v6
; CGP-NEXT: v_add_i32_e32 v9, vcc, v3, v5
-; CGP-NEXT: v_mul_hi_u32 v10, v8, v2
+; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v3
+; CGP-NEXT: v_mul_hi_u32 v6, v8, v2
; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v9, 0
-; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v5
-; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v5
+; CGP-NEXT: v_add_i32_e32 v10, vcc, v6, v5
; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v10, v[3:4]
; CGP-NEXT: v_sub_i32_e32 v2, vcc, v7, v2
; CGP-NEXT: v_subb_u32_e64 v3, s[4:5], v8, v5, vcc
@@ -1640,10 +1625,10 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) {
; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3
; CGP-NEXT: v_cndmask_b32_e32 v2, v9, v2, vcc
; CGP-NEXT: v_cndmask_b32_e32 v3, v10, v4, vcc
-; CGP-NEXT: v_xor_b32_e32 v2, v2, v12
-; CGP-NEXT: v_xor_b32_e32 v3, v3, v12
-; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v12
-; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v12, vcc
+; CGP-NEXT: v_xor_b32_e32 v2, v2, v13
+; CGP-NEXT: v_xor_b32_e32 v3, v3, v13
+; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v13
+; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v13, vcc
; CGP-NEXT: s_setpc_b64 s[30:31]
%result = sdiv <2 x i64> %num,
ret <2 x i64> %result
@@ -1679,28 +1664,28 @@ define i64 @v_sdiv_i64_pow2_shl_denom(i64 %x, i64 %y) {
; CHECK-NEXT: v_xor_b32_e32 v1, v5, v0
; CHECK-NEXT: v_cvt_f32_u32_e32 v5, v2
; CHECK-NEXT: v_cvt_f32_u32_e32 v6, v1
-; CHECK-NEXT: v_sub_i32_e32 v12, vcc, 0, v2
-; CHECK-NEXT: v_subb_u32_e32 v13, vcc, 0, v1, vcc
+; CHECK-NEXT: v_sub_i32_e32 v13, vcc, 0, v2
+; CHECK-NEXT: v_subb_u32_e32 v14, vcc, 0, v1, vcc
; CHECK-NEXT: v_mac_f32_e32 v5, 0x4f800000, v6
; CHECK-NEXT: v_rcp_iflag_f32_e32 v5, v5
; CHECK-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5
; CHECK-NEXT: v_mul_f32_e32 v6, 0x2f800000, v5
-; CHECK-NEXT: v_trunc_f32_e32 v7, v6
-; CHECK-NEXT: v_mac_f32_e32 v5, 0xcf800000, v7
-; CHECK-NEXT: v_cvt_u32_f32_e32 v11, v5
-; CHECK-NEXT: v_cvt_u32_f32_e32 v14, v7
-; CHECK-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v12, v11, 0
-; CHECK-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v12, v14, v[6:7]
-; CHECK-NEXT: v_mul_lo_u32 v6, v14, v5
-; CHECK-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v11, v[7:8]
-; CHECK-NEXT: v_mul_hi_u32 v7, v11, v5
-; CHECK-NEXT: v_mul_hi_u32 v5, v14, v5
-; CHECK-NEXT: v_mul_lo_u32 v8, v11, v9
-; CHECK-NEXT: v_mul_lo_u32 v10, v14, v9
+; CHECK-NEXT: v_trunc_f32_e32 v6, v6
+; CHECK-NEXT: v_mac_f32_e32 v5, 0xcf800000, v6
+; CHECK-NEXT: v_cvt_u32_f32_e32 v12, v5
+; CHECK-NEXT: v_cvt_u32_f32_e32 v11, v6
+; CHECK-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v13, v12, 0
+; CHECK-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v13, v11, v[6:7]
+; CHECK-NEXT: v_mul_lo_u32 v6, v11, v5
+; CHECK-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v14, v12, v[7:8]
+; CHECK-NEXT: v_mul_hi_u32 v7, v12, v5
+; CHECK-NEXT: v_mul_hi_u32 v5, v11, v5
+; CHECK-NEXT: v_mul_lo_u32 v8, v12, v9
+; CHECK-NEXT: v_mul_lo_u32 v10, v11, v9
; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v8
; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v7
-; CHECK-NEXT: v_mul_hi_u32 v7, v11, v9
+; CHECK-NEXT: v_mul_hi_u32 v7, v12, v9
; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v6, vcc, v8, v6
; CHECK-NEXT: v_add_i32_e32 v5, vcc, v10, v5
@@ -1708,53 +1693,53 @@ define i64 @v_sdiv_i64_pow2_shl_denom(i64 %x, i64 %y) {
; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v7
; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v7, vcc, v8, v7
-; CHECK-NEXT: v_mul_hi_u32 v8, v14, v9
+; CHECK-NEXT: v_mul_hi_u32 v8, v11, v9
; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v6
; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v6, vcc, v7, v6
; CHECK-NEXT: v_add_i32_e32 v6, vcc, v8, v6
-; CHECK-NEXT: v_add_i32_e32 v11, vcc, v11, v5
-; CHECK-NEXT: v_addc_u32_e32 v14, vcc, v14, v6, vcc
-; CHECK-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v12, v11, 0
-; CHECK-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v12, v14, v[6:7]
-; CHECK-NEXT: v_ashrrev_i32_e32 v12, 31, v4
-; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v12
-; CHECK-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v11, v[7:8]
-; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v4, v12, vcc
-; CHECK-NEXT: v_xor_b32_e32 v10, v3, v12
-; CHECK-NEXT: v_mul_lo_u32 v3, v14, v5
-; CHECK-NEXT: v_mul_lo_u32 v6, v11, v9
-; CHECK-NEXT: v_xor_b32_e32 v13, v4, v12
-; CHECK-NEXT: v_mul_hi_u32 v4, v11, v5
-; CHECK-NEXT: v_mul_hi_u32 v5, v14, v5
+; CHECK-NEXT: v_add_i32_e32 v12, vcc, v12, v5
+; CHECK-NEXT: v_addc_u32_e32 v11, vcc, v11, v6, vcc
+; CHECK-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v13, v12, 0
+; CHECK-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v13, v11, v[6:7]
+; CHECK-NEXT: v_ashrrev_i32_e32 v13, 31, v4
+; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v13
+; CHECK-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v14, v12, v[7:8]
+; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v4, v13, vcc
+; CHECK-NEXT: v_xor_b32_e32 v10, v3, v13
+; CHECK-NEXT: v_mul_lo_u32 v3, v11, v5
+; CHECK-NEXT: v_mul_lo_u32 v6, v12, v9
+; CHECK-NEXT: v_xor_b32_e32 v14, v4, v13
+; CHECK-NEXT: v_mul_hi_u32 v4, v12, v5
+; CHECK-NEXT: v_mul_hi_u32 v5, v11, v5
; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v6
; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v4
; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
-; CHECK-NEXT: v_mul_lo_u32 v4, v14, v9
+; CHECK-NEXT: v_mul_lo_u32 v4, v11, v9
; CHECK-NEXT: v_add_i32_e32 v3, vcc, v6, v3
-; CHECK-NEXT: v_mul_hi_u32 v6, v11, v9
+; CHECK-NEXT: v_mul_hi_u32 v6, v12, v9
; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v5
; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v6
; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v6
-; CHECK-NEXT: v_mul_hi_u32 v6, v14, v9
+; CHECK-NEXT: v_mul_hi_u32 v6, v11, v9
; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v3
; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v4, vcc, v5, v4
; CHECK-NEXT: v_add_i32_e32 v4, vcc, v6, v4
-; CHECK-NEXT: v_add_i32_e32 v3, vcc, v11, v3
-; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v14, v4, vcc
-; CHECK-NEXT: v_mul_lo_u32 v5, v13, v3
+; CHECK-NEXT: v_add_i32_e32 v3, vcc, v12, v3
+; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v11, v4, vcc
+; CHECK-NEXT: v_mul_lo_u32 v5, v14, v3
; CHECK-NEXT: v_mul_lo_u32 v6, v10, v4
; CHECK-NEXT: v_mul_hi_u32 v7, v10, v3
-; CHECK-NEXT: v_mul_hi_u32 v3, v13, v3
+; CHECK-NEXT: v_mul_hi_u32 v3, v14, v3
; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v6
; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v7
; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; CHECK-NEXT: v_mul_lo_u32 v7, v13, v4
+; CHECK-NEXT: v_mul_lo_u32 v7, v14, v4
; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5
; CHECK-NEXT: v_mul_hi_u32 v6, v10, v4
; CHECK-NEXT: v_add_i32_e32 v3, vcc, v7, v3
@@ -1763,16 +1748,16 @@ define i64 @v_sdiv_i64_pow2_shl_denom(i64 %x, i64 %y) {
; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v6, vcc, v7, v6
; CHECK-NEXT: v_add_i32_e32 v9, vcc, v3, v5
-; CHECK-NEXT: v_mul_hi_u32 v7, v13, v4
+; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v3
+; CHECK-NEXT: v_mul_hi_u32 v6, v14, v4
; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v2, v9, 0
-; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5
-; CHECK-NEXT: v_add_i32_e32 v11, vcc, v7, v5
+; CHECK-NEXT: v_add_i32_e32 v11, vcc, v6, v5
; CHECK-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v2, v11, v[4:5]
; CHECK-NEXT: v_sub_i32_e32 v3, vcc, v10, v3
; CHECK-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v1, v9, v[5:6]
-; CHECK-NEXT: v_subb_u32_e64 v4, s[4:5], v13, v7, vcc
-; CHECK-NEXT: v_sub_i32_e64 v5, s[4:5], v13, v7
+; CHECK-NEXT: v_subb_u32_e64 v4, s[4:5], v14, v7, vcc
+; CHECK-NEXT: v_sub_i32_e64 v5, s[4:5], v14, v7
; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v1
; CHECK-NEXT: v_subb_u32_e32 v5, vcc, v5, v1, vcc
; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5]
@@ -1797,7 +1782,7 @@ define i64 @v_sdiv_i64_pow2_shl_denom(i64 %x, i64 %y) {
; CHECK-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
; CHECK-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc
-; CHECK-NEXT: v_xor_b32_e32 v3, v12, v0
+; CHECK-NEXT: v_xor_b32_e32 v3, v13, v0
; CHECK-NEXT: v_cndmask_b32_e32 v2, v11, v2, vcc
; CHECK-NEXT: v_xor_b32_e32 v0, v1, v3
; CHECK-NEXT: v_xor_b32_e32 v1, v2, v3
@@ -1839,274 +1824,268 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
; GISEL-LABEL: v_sdiv_v2i64_pow2_shl_denom:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_mov_b32_e32 v9, 0x1000
-; GISEL-NEXT: v_mov_b32_e32 v10, 0
-; GISEL-NEXT: v_lshl_b64 v[7:8], v[9:10], v4
-; GISEL-NEXT: v_lshl_b64 v[9:10], v[9:10], v6
+; GISEL-NEXT: v_mov_b32_e32 v12, 0x1000
+; GISEL-NEXT: v_mov_b32_e32 v13, 0
+; GISEL-NEXT: v_lshl_b64 v[7:8], v[12:13], v4
; GISEL-NEXT: v_ashrrev_i32_e32 v4, 31, v8
; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v4
; GISEL-NEXT: v_addc_u32_e32 v7, vcc, v8, v4, vcc
; GISEL-NEXT: v_xor_b32_e32 v8, v5, v4
-; GISEL-NEXT: v_xor_b32_e32 v5, v7, v4
-; GISEL-NEXT: v_cvt_f32_u32_e32 v7, v8
-; GISEL-NEXT: v_cvt_f32_u32_e32 v11, v5
-; GISEL-NEXT: v_sub_i32_e32 v17, vcc, 0, v8
-; GISEL-NEXT: v_subb_u32_e32 v18, vcc, 0, v5, vcc
-; GISEL-NEXT: v_mac_f32_e32 v7, 0x4f800000, v11
-; GISEL-NEXT: v_rcp_iflag_f32_e32 v7, v7
-; GISEL-NEXT: v_ashrrev_i32_e32 v6, 31, v10
-; GISEL-NEXT: v_mul_f32_e32 v7, 0x5f7ffffc, v7
-; GISEL-NEXT: v_mul_f32_e32 v11, 0x2f800000, v7
-; GISEL-NEXT: v_trunc_f32_e32 v13, v11
-; GISEL-NEXT: v_mac_f32_e32 v7, 0xcf800000, v13
-; GISEL-NEXT: v_cvt_u32_f32_e32 v16, v7
-; GISEL-NEXT: v_cvt_u32_f32_e32 v19, v13
-; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v17, v16, 0
-; GISEL-NEXT: v_mov_b32_e32 v7, v12
-; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v17, v19, v[7:8]
-; GISEL-NEXT: v_mul_lo_u32 v7, v19, v11
-; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v18, v16, v[12:13]
-; GISEL-NEXT: v_mul_lo_u32 v12, v16, v14
-; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v12
-; GISEL-NEXT: v_mul_hi_u32 v12, v16, v11
-; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT: v_mul_hi_u32 v11, v19, v11
-; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v12
-; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v12, v19, v14
-; GISEL-NEXT: v_add_i32_e32 v7, vcc, v13, v7
-; GISEL-NEXT: v_mul_hi_u32 v13, v16, v14
-; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11
-; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13
-; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v13
-; GISEL-NEXT: v_mul_hi_u32 v13, v19, v14
-; GISEL-NEXT: v_add_i32_e32 v7, vcc, v11, v7
+; GISEL-NEXT: v_xor_b32_e32 v7, v7, v4
+; GISEL-NEXT: v_cvt_f32_u32_e32 v5, v8
+; GISEL-NEXT: v_cvt_f32_u32_e32 v9, v7
+; GISEL-NEXT: v_sub_i32_e32 v16, vcc, 0, v8
+; GISEL-NEXT: v_subb_u32_e32 v18, vcc, 0, v7, vcc
+; GISEL-NEXT: v_mac_f32_e32 v5, 0x4f800000, v9
+; GISEL-NEXT: v_rcp_iflag_f32_e32 v5, v5
+; GISEL-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5
+; GISEL-NEXT: v_mul_f32_e32 v9, 0x2f800000, v5
+; GISEL-NEXT: v_trunc_f32_e32 v9, v9
+; GISEL-NEXT: v_mac_f32_e32 v5, 0xcf800000, v9
+; GISEL-NEXT: v_cvt_u32_f32_e32 v5, v5
+; GISEL-NEXT: v_cvt_u32_f32_e32 v11, v9
+; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v16, v5, 0
+; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v16, v11, v[10:11]
+; GISEL-NEXT: v_mul_lo_u32 v17, v11, v9
+; GISEL-NEXT: v_mul_hi_u32 v19, v5, v9
+; GISEL-NEXT: v_mul_hi_u32 v20, v11, v9
+; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v18, v5, v[14:15]
+; GISEL-NEXT: v_mul_lo_u32 v10, v5, v9
+; GISEL-NEXT: v_mul_lo_u32 v15, v11, v9
+; GISEL-NEXT: v_add_i32_e32 v10, vcc, v17, v10
+; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v19
+; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v10, vcc, v14, v10
+; GISEL-NEXT: v_mul_hi_u32 v14, v5, v9
+; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v20
+; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v14
+; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v15, vcc, v17, v15
+; GISEL-NEXT: v_mul_hi_u32 v9, v11, v9
+; GISEL-NEXT: v_add_i32_e32 v10, vcc, v14, v10
+; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v14
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v14
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v10
+; GISEL-NEXT: v_addc_u32_e32 v19, vcc, v11, v9, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v16, v5, 0
+; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v16, v19, v[11:12]
+; GISEL-NEXT: v_mul_lo_u32 v9, v19, v10
+; GISEL-NEXT: v_lshl_b64 v[12:13], v[12:13], v6
+; GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v18, v5, v[14:15]
+; GISEL-NEXT: v_mul_hi_u32 v14, v5, v10
+; GISEL-NEXT: v_mul_lo_u32 v11, v5, v16
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v11
; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11
-; GISEL-NEXT: v_add_i32_e32 v11, vcc, v13, v11
-; GISEL-NEXT: v_add_i32_e32 v16, vcc, v16, v7
-; GISEL-NEXT: v_addc_u32_e32 v19, vcc, v19, v11, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v17, v16, 0
-; GISEL-NEXT: v_mov_b32_e32 v7, v12
-; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v17, v19, v[7:8]
-; GISEL-NEXT: v_ashrrev_i32_e32 v7, 31, v1
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v7
-; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v18, v16, v[12:13]
-; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v7, vcc
-; GISEL-NEXT: v_xor_b32_e32 v17, v0, v7
-; GISEL-NEXT: v_mul_lo_u32 v0, v19, v11
-; GISEL-NEXT: v_mul_lo_u32 v12, v16, v14
-; GISEL-NEXT: v_xor_b32_e32 v18, v1, v7
-; GISEL-NEXT: v_mul_hi_u32 v1, v16, v11
-; GISEL-NEXT: v_mul_hi_u32 v11, v19, v11
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v12
-; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v1
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v14
+; GISEL-NEXT: v_ashrrev_i32_e32 v9, 31, v1
+; GISEL-NEXT: v_add_i32_e64 v0, s[4:5], v0, v9
+; GISEL-NEXT: v_addc_u32_e64 v1, s[4:5], v1, v9, s[4:5]
+; GISEL-NEXT: v_xor_b32_e32 v14, v0, v9
+; GISEL-NEXT: v_xor_b32_e32 v15, v1, v9
; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v1, v19, v14
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v12, v0
-; GISEL-NEXT: v_mul_hi_u32 v12, v16, v14
+; GISEL-NEXT: v_mul_hi_u32 v1, v19, v10
+; GISEL-NEXT: v_mul_lo_u32 v10, v19, v16
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v11, v0
+; GISEL-NEXT: v_mul_hi_u32 v11, v5, v16
+; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1
+; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v11
; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v12
-; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12
-; GISEL-NEXT: v_mul_hi_u32 v12, v19, v14
+; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v11
+; GISEL-NEXT: v_mul_hi_u32 v11, v19, v16
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v1, v0
; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1
; GISEL-NEXT: v_add_i32_e32 v1, vcc, v11, v1
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, v12, v1
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v16, v0
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0
; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v19, v1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v11, v18, v0
-; GISEL-NEXT: v_mul_lo_u32 v12, v17, v1
-; GISEL-NEXT: v_mul_hi_u32 v13, v17, v0
-; GISEL-NEXT: v_mul_hi_u32 v0, v18, v0
-; GISEL-NEXT: v_xor_b32_e32 v7, v7, v4
-; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12
-; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13
+; GISEL-NEXT: v_mul_lo_u32 v5, v15, v0
+; GISEL-NEXT: v_mul_lo_u32 v10, v14, v1
+; GISEL-NEXT: v_mul_hi_u32 v11, v14, v0
+; GISEL-NEXT: v_mul_hi_u32 v0, v15, v0
+; GISEL-NEXT: v_xor_b32_e32 v4, v9, v4
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v10
+; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v11
+; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
+; GISEL-NEXT: v_mul_lo_u32 v11, v15, v1
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v10, v5
+; GISEL-NEXT: v_mul_hi_u32 v10, v14, v1
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v11, v0
; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v13, v18, v1
-; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11
-; GISEL-NEXT: v_mul_hi_u32 v12, v17, v1
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v13, v0
-; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v12
-; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v12
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v11
-; GISEL-NEXT: v_mul_hi_u32 v1, v18, v1
-; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v8, v0, 0
-; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14
-; GISEL-NEXT: v_add_i32_e32 v16, vcc, v1, v13
-; GISEL-NEXT: v_mov_b32_e32 v1, v12
-; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v8, v16, v[1:2]
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, v9, v6
-; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v10, v6, vcc
-; GISEL-NEXT: v_xor_b32_e32 v10, v1, v6
-; GISEL-NEXT: v_xor_b32_e32 v9, v9, v6
-; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v5, v0, v[12:13]
-; GISEL-NEXT: v_cvt_f32_u32_e32 v1, v10
-; GISEL-NEXT: v_cvt_f32_u32_e32 v12, v9
-; GISEL-NEXT: v_sub_i32_e32 v13, vcc, v17, v11
-; GISEL-NEXT: v_sub_i32_e64 v11, s[4:5], v18, v14
-; GISEL-NEXT: v_mac_f32_e32 v1, 0x4f800000, v12
-; GISEL-NEXT: v_rcp_iflag_f32_e32 v1, v1
-; GISEL-NEXT: v_subb_u32_e64 v17, s[4:5], v18, v14, vcc
-; GISEL-NEXT: v_subb_u32_e32 v14, vcc, v11, v5, vcc
-; GISEL-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v1
-; GISEL-NEXT: v_mul_f32_e32 v11, 0x2f800000, v1
-; GISEL-NEXT: v_trunc_f32_e32 v15, v11
-; GISEL-NEXT: v_mac_f32_e32 v1, 0xcf800000, v15
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v10
+; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10
+; GISEL-NEXT: v_add_i32_e32 v16, vcc, v0, v5
+; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v10, v0
+; GISEL-NEXT: v_mul_hi_u32 v10, v15, v1
+; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v16, 0
+; GISEL-NEXT: v_ashrrev_i32_e32 v9, 31, v3
+; GISEL-NEXT: v_add_i32_e32 v17, vcc, v10, v5
+; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v8, v17, v[1:2]
+; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v14, v0
+; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v7, v16, v[10:11]
+; GISEL-NEXT: v_subb_u32_e64 v1, s[4:5], v15, v5, vcc
+; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], v15, v5
+; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v1, v7
+; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5]
+; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v8
+; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5]
+; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v1, v7
+; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v5, v7, vcc
+; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v8
+; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
+; GISEL-NEXT: v_add_i32_e32 v14, vcc, 1, v16
+; GISEL-NEXT: v_addc_u32_e32 v15, vcc, 0, v17, vcc
+; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v1, v7
+; GISEL-NEXT: v_ashrrev_i32_e32 v5, 31, v13
+; GISEL-NEXT: v_cndmask_b32_e64 v11, v6, v10, s[4:5]
+; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, -1, vcc
+; GISEL-NEXT: v_add_i32_e32 v6, vcc, v12, v5
+; GISEL-NEXT: v_addc_u32_e32 v12, vcc, v13, v5, vcc
+; GISEL-NEXT: v_xor_b32_e32 v10, v6, v5
+; GISEL-NEXT: v_xor_b32_e32 v6, v12, v5
+; GISEL-NEXT: v_cvt_f32_u32_e32 v12, v10
+; GISEL-NEXT: v_cvt_f32_u32_e32 v13, v6
+; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v8
+; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
+; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v1, v7
+; GISEL-NEXT: v_mac_f32_e32 v12, 0x4f800000, v13
+; GISEL-NEXT: v_rcp_iflag_f32_e32 v1, v12
+; GISEL-NEXT: v_cndmask_b32_e32 v0, v18, v0, vcc
+; GISEL-NEXT: v_add_i32_e32 v7, vcc, 1, v14
+; GISEL-NEXT: v_addc_u32_e32 v12, vcc, 0, v15, vcc
+; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
+; GISEL-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v1
+; GISEL-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0
+; GISEL-NEXT: v_trunc_f32_e32 v1, v1
+; GISEL-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1
+; GISEL-NEXT: v_cndmask_b32_e32 v13, v14, v7, vcc
+; GISEL-NEXT: v_cvt_u32_f32_e32 v14, v0
+; GISEL-NEXT: v_sub_i32_e64 v19, s[4:5], 0, v10
; GISEL-NEXT: v_cvt_u32_f32_e32 v18, v1
-; GISEL-NEXT: v_sub_i32_e32 v19, vcc, 0, v10
-; GISEL-NEXT: v_subb_u32_e32 v20, vcc, 0, v9, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[6:7], v19, v18, 0
-; GISEL-NEXT: v_cvt_u32_f32_e32 v22, v15
-; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v13, v8
-; GISEL-NEXT: v_subbrev_u32_e32 v21, vcc, 0, v14, vcc
-; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v1, v8
-; GISEL-NEXT: v_mov_b32_e32 v1, v12
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v13, v8
-; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[6:7], v19, v22, v[1:2]
-; GISEL-NEXT: v_mul_lo_u32 v1, v22, v11
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v21, v5
-; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[6:7], v20, v18, v[12:13]
-; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, -1, s[4:5]
-; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v17, v5
-; GISEL-NEXT: v_mul_lo_u32 v8, v18, v14
-; GISEL-NEXT: v_add_i32_e64 v1, s[6:7], v1, v8
-; GISEL-NEXT: v_mul_hi_u32 v8, v18, v11
-; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[6:7]
-; GISEL-NEXT: v_mul_hi_u32 v11, v22, v11
-; GISEL-NEXT: v_add_i32_e64 v1, s[6:7], v1, v8
-; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[8:9]
-; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc
-; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v21, v5
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v17, v5
-; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[8:9]
-; GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc
-; GISEL-NEXT: v_add_i32_e32 v8, vcc, 1, v0
-; GISEL-NEXT: v_cndmask_b32_e64 v5, v13, v15, s[4:5]
-; GISEL-NEXT: v_addc_u32_e32 v13, vcc, 0, v16, vcc
-; GISEL-NEXT: v_add_i32_e32 v15, vcc, 1, v8
-; GISEL-NEXT: v_addc_u32_e32 v17, vcc, 0, v13, vcc
-; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
-; GISEL-NEXT: v_cndmask_b32_e32 v1, v8, v15, vcc
-; GISEL-NEXT: v_cndmask_b32_e32 v8, v13, v17, vcc
-; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[6:7]
-; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v13
-; GISEL-NEXT: v_mul_lo_u32 v13, v22, v14
-; GISEL-NEXT: v_add_i32_e32 v11, vcc, v13, v11
-; GISEL-NEXT: v_mul_hi_u32 v13, v18, v14
-; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
-; GISEL-NEXT: v_mul_hi_u32 v14, v22, v14
-; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13
-; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v13, vcc, v15, v13
-; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12
-; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12
-; GISEL-NEXT: v_add_i32_e32 v12, vcc, v14, v12
-; GISEL-NEXT: v_add_i32_e32 v13, vcc, v18, v11
-; GISEL-NEXT: v_addc_u32_e32 v14, vcc, v22, v12, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v19, v13, 0
-; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5
-; GISEL-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc
-; GISEL-NEXT: v_mov_b32_e32 v0, v12
-; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v19, v14, v[0:1]
-; GISEL-NEXT: v_xor_b32_e32 v12, v1, v7
-; GISEL-NEXT: v_ashrrev_i32_e32 v15, 31, v3
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v20, v13, v[4:5]
-; GISEL-NEXT: v_cndmask_b32_e32 v8, v16, v8, vcc
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, v2, v15
-; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v3, v15, vcc
-; GISEL-NEXT: v_xor_b32_e32 v5, v1, v15
-; GISEL-NEXT: v_mul_lo_u32 v1, v14, v11
-; GISEL-NEXT: v_mul_lo_u32 v3, v13, v0
-; GISEL-NEXT: v_xor_b32_e32 v16, v2, v15
-; GISEL-NEXT: v_mul_hi_u32 v2, v13, v11
-; GISEL-NEXT: v_mul_hi_u32 v4, v14, v11
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v3
-; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v2
-; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GISEL-NEXT: v_subb_u32_e64 v20, s[4:5], 0, v6, s[4:5]
+; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v19, v14, 0
+; GISEL-NEXT: v_cndmask_b32_e32 v12, v15, v12, vcc
+; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11
+; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v19, v18, v[1:2]
+; GISEL-NEXT: v_mul_lo_u32 v21, v18, v0
+; GISEL-NEXT: v_mul_hi_u32 v22, v14, v0
+; GISEL-NEXT: v_mul_hi_u32 v23, v18, v0
+; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v20, v14, v[7:8]
+; GISEL-NEXT: v_cndmask_b32_e32 v11, v16, v13, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v15, v17, v12, vcc
+; GISEL-NEXT: v_mul_lo_u32 v1, v14, v0
+; GISEL-NEXT: v_mul_lo_u32 v8, v18, v0
+; GISEL-NEXT: v_add_i32_e64 v1, s[4:5], v21, v1
+; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[4:5]
+; GISEL-NEXT: v_add_i32_e64 v1, s[4:5], v1, v22
+; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5]
+; GISEL-NEXT: v_add_i32_e64 v1, s[4:5], v7, v1
+; GISEL-NEXT: v_mul_hi_u32 v7, v14, v0
+; GISEL-NEXT: v_add_i32_e64 v8, s[4:5], v8, v23
+; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5]
+; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v8, v7
+; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[4:5]
+; GISEL-NEXT: v_add_i32_e64 v8, s[4:5], v13, v8
+; GISEL-NEXT: v_mul_hi_u32 v0, v18, v0
+; GISEL-NEXT: v_add_i32_e64 v1, s[4:5], v7, v1
+; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[4:5]
+; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v8, v7
+; GISEL-NEXT: v_add_i32_e64 v0, s[4:5], v0, v7
+; GISEL-NEXT: v_add_i32_e64 v13, s[4:5], v14, v1
+; GISEL-NEXT: v_addc_u32_e64 v14, s[4:5], v18, v0, s[4:5]
+; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v19, v13, 0
+; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v19, v14, v[1:2]
+; GISEL-NEXT: v_xor_b32_e32 v1, v11, v4
+; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v9
+; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v20, v13, v[7:8]
+; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v9, vcc
+; GISEL-NEXT: v_xor_b32_e32 v12, v2, v9
; GISEL-NEXT: v_mul_lo_u32 v2, v14, v0
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, v3, v1
+; GISEL-NEXT: v_mul_lo_u32 v7, v13, v11
+; GISEL-NEXT: v_xor_b32_e32 v16, v3, v9
; GISEL-NEXT: v_mul_hi_u32 v3, v13, v0
-; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v4
-; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3
-; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v3, vcc, v4, v3
; GISEL-NEXT: v_mul_hi_u32 v0, v14, v0
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, v2, v1
-; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, v13, v1
-; GISEL-NEXT: v_addc_u32_e32 v0, vcc, v14, v0, vcc
-; GISEL-NEXT: v_mul_lo_u32 v2, v16, v1
-; GISEL-NEXT: v_mul_lo_u32 v3, v5, v0
-; GISEL-NEXT: v_mul_hi_u32 v4, v5, v1
-; GISEL-NEXT: v_mul_hi_u32 v1, v16, v1
-; GISEL-NEXT: v_xor_b32_e32 v8, v8, v7
+; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v7
+; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3
+; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; GISEL-NEXT: v_mul_lo_u32 v3, v14, v11
+; GISEL-NEXT: v_add_i32_e32 v2, vcc, v7, v2
+; GISEL-NEXT: v_mul_hi_u32 v7, v13, v11
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0
; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v4
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v7
+; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v7
+; GISEL-NEXT: v_mul_hi_u32 v7, v14, v11
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2
; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v4, v16, v0
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2
-; GISEL-NEXT: v_mul_hi_u32 v3, v5, v0
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, v4, v1
-; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v3
-; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v3
-; GISEL-NEXT: v_add_i32_e32 v11, vcc, v1, v2
+; GISEL-NEXT: v_add_i32_e32 v2, vcc, v7, v2
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v13, v0
+; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v14, v2, vcc
+; GISEL-NEXT: v_mul_lo_u32 v3, v16, v0
+; GISEL-NEXT: v_mul_lo_u32 v7, v12, v2
+; GISEL-NEXT: v_mul_hi_u32 v8, v12, v0
; GISEL-NEXT: v_mul_hi_u32 v0, v16, v0
-; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v10, v11, 0
-; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, v4, v1
-; GISEL-NEXT: v_add_i32_e32 v13, vcc, v0, v1
-; GISEL-NEXT: v_mov_b32_e32 v0, v3
-; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v10, v13, v[0:1]
-; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v12, v7
-; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v8, v7, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v9, v11, v[3:4]
-; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v5, v2
-; GISEL-NEXT: v_subb_u32_e64 v3, s[4:5], v16, v7, vcc
-; GISEL-NEXT: v_sub_i32_e64 v4, s[4:5], v16, v7
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v9
-; GISEL-NEXT: v_subb_u32_e32 v4, vcc, v4, v9, vcc
-; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[4:5]
+; GISEL-NEXT: v_xor_b32_e32 v11, v15, v4
+; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v7
+; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v8
+; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
+; GISEL-NEXT: v_mul_lo_u32 v8, v16, v2
+; GISEL-NEXT: v_add_i32_e32 v3, vcc, v7, v3
+; GISEL-NEXT: v_mul_hi_u32 v7, v12, v2
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v8, v0
+; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v7
+; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v7, vcc, v8, v7
+; GISEL-NEXT: v_add_i32_e32 v13, vcc, v0, v3
+; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v7, v0
+; GISEL-NEXT: v_mul_hi_u32 v7, v16, v2
+; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v10, v13, 0
+; GISEL-NEXT: v_add_i32_e32 v14, vcc, v7, v0
+; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v10, v14, v[3:4]
+; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v1, v4
+; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v11, v4, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v13, v[7:8]
+; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v12, v2
+; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], v16, v3, vcc
+; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v16, v3
+; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v6
+; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v6, vcc
+; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5]
; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v10
; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v10
-; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5]
-; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v3, v9
-; GISEL-NEXT: v_subbrev_u32_e32 v4, vcc, 0, v4, vcc
-; GISEL-NEXT: v_cndmask_b32_e64 v3, v5, v7, s[4:5]
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, 1, v11
-; GISEL-NEXT: v_addc_u32_e32 v7, vcc, 0, v13, vcc
-; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v4, v9
-; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc
+; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5]
+; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, v6
+; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc
+; GISEL-NEXT: v_cndmask_b32_e64 v4, v7, v8, s[4:5]
+; GISEL-NEXT: v_add_i32_e32 v7, vcc, 1, v13
+; GISEL-NEXT: v_addc_u32_e32 v8, vcc, 0, v14, vcc
+; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v3, v6
+; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, vcc
; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v2, v10
; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc
-; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v4, v9
-; GISEL-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, 1, v5
-; GISEL-NEXT: v_addc_u32_e32 v8, vcc, 0, v7, vcc
-; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
-; GISEL-NEXT: v_cndmask_b32_e32 v2, v5, v4, vcc
-; GISEL-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc
-; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3
+; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6
; GISEL-NEXT: v_cndmask_b32_e32 v2, v11, v2, vcc
-; GISEL-NEXT: v_cndmask_b32_e32 v3, v13, v4, vcc
-; GISEL-NEXT: v_xor_b32_e32 v4, v15, v6
+; GISEL-NEXT: v_add_i32_e32 v3, vcc, 1, v7
+; GISEL-NEXT: v_addc_u32_e32 v6, vcc, 0, v8, vcc
+; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
+; GISEL-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v3, v8, v6, vcc
+; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
+; GISEL-NEXT: v_cndmask_b32_e32 v2, v13, v2, vcc
+; GISEL-NEXT: v_xor_b32_e32 v4, v9, v5
+; GISEL-NEXT: v_cndmask_b32_e32 v3, v14, v3, vcc
; GISEL-NEXT: v_xor_b32_e32 v2, v2, v4
; GISEL-NEXT: v_xor_b32_e32 v3, v3, v4
; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v4
@@ -2138,28 +2117,28 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
; CGP-NEXT: v_xor_b32_e32 v1, v10, v0
; CGP-NEXT: v_cvt_f32_u32_e32 v10, v4
; CGP-NEXT: v_cvt_f32_u32_e32 v11, v1
-; CGP-NEXT: v_sub_i32_e32 v17, vcc, 0, v4
-; CGP-NEXT: v_subb_u32_e32 v18, vcc, 0, v1, vcc
+; CGP-NEXT: v_sub_i32_e32 v18, vcc, 0, v4
+; CGP-NEXT: v_subb_u32_e32 v19, vcc, 0, v1, vcc
; CGP-NEXT: v_mac_f32_e32 v10, 0x4f800000, v11
; CGP-NEXT: v_rcp_iflag_f32_e32 v10, v10
; CGP-NEXT: v_mul_f32_e32 v10, 0x5f7ffffc, v10
; CGP-NEXT: v_mul_f32_e32 v11, 0x2f800000, v10
-; CGP-NEXT: v_trunc_f32_e32 v12, v11
-; CGP-NEXT: v_mac_f32_e32 v10, 0xcf800000, v12
-; CGP-NEXT: v_cvt_u32_f32_e32 v16, v10
-; CGP-NEXT: v_cvt_u32_f32_e32 v19, v12
-; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v17, v16, 0
-; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v17, v19, v[11:12]
-; CGP-NEXT: v_mul_lo_u32 v11, v19, v10
-; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v18, v16, v[12:13]
-; CGP-NEXT: v_mul_hi_u32 v12, v16, v10
-; CGP-NEXT: v_mul_hi_u32 v10, v19, v10
-; CGP-NEXT: v_mul_lo_u32 v13, v16, v14
-; CGP-NEXT: v_mul_lo_u32 v15, v19, v14
+; CGP-NEXT: v_trunc_f32_e32 v11, v11
+; CGP-NEXT: v_mac_f32_e32 v10, 0xcf800000, v11
+; CGP-NEXT: v_cvt_u32_f32_e32 v17, v10
+; CGP-NEXT: v_cvt_u32_f32_e32 v16, v11
+; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v18, v17, 0
+; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v18, v16, v[11:12]
+; CGP-NEXT: v_mul_lo_u32 v11, v16, v10
+; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v19, v17, v[12:13]
+; CGP-NEXT: v_mul_hi_u32 v12, v17, v10
+; CGP-NEXT: v_mul_hi_u32 v10, v16, v10
+; CGP-NEXT: v_mul_lo_u32 v13, v17, v14
+; CGP-NEXT: v_mul_lo_u32 v15, v16, v14
; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v13
; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v12
-; CGP-NEXT: v_mul_hi_u32 v12, v16, v14
+; CGP-NEXT: v_mul_hi_u32 v12, v17, v14
; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v11, vcc, v13, v11
; CGP-NEXT: v_add_i32_e32 v10, vcc, v15, v10
@@ -2167,53 +2146,53 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v12
; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v12, vcc, v13, v12
-; CGP-NEXT: v_mul_hi_u32 v13, v19, v14
+; CGP-NEXT: v_mul_hi_u32 v13, v16, v14
; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v11
; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v11, vcc, v12, v11
; CGP-NEXT: v_add_i32_e32 v11, vcc, v13, v11
-; CGP-NEXT: v_add_i32_e32 v16, vcc, v16, v10
-; CGP-NEXT: v_addc_u32_e32 v19, vcc, v19, v11, vcc
-; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v17, v16, 0
-; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v17, v19, v[11:12]
-; CGP-NEXT: v_ashrrev_i32_e32 v17, 31, v9
-; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v17
-; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v18, v16, v[12:13]
-; CGP-NEXT: v_addc_u32_e32 v9, vcc, v9, v17, vcc
-; CGP-NEXT: v_xor_b32_e32 v15, v8, v17
-; CGP-NEXT: v_mul_lo_u32 v8, v19, v10
-; CGP-NEXT: v_mul_lo_u32 v11, v16, v14
-; CGP-NEXT: v_xor_b32_e32 v18, v9, v17
-; CGP-NEXT: v_mul_hi_u32 v9, v16, v10
-; CGP-NEXT: v_mul_hi_u32 v10, v19, v10
+; CGP-NEXT: v_add_i32_e32 v17, vcc, v17, v10
+; CGP-NEXT: v_addc_u32_e32 v16, vcc, v16, v11, vcc
+; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v18, v17, 0
+; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v18, v16, v[11:12]
+; CGP-NEXT: v_ashrrev_i32_e32 v18, 31, v9
+; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v18
+; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v19, v17, v[12:13]
+; CGP-NEXT: v_addc_u32_e32 v9, vcc, v9, v18, vcc
+; CGP-NEXT: v_xor_b32_e32 v15, v8, v18
+; CGP-NEXT: v_mul_lo_u32 v8, v16, v10
+; CGP-NEXT: v_mul_lo_u32 v11, v17, v14
+; CGP-NEXT: v_xor_b32_e32 v19, v9, v18
+; CGP-NEXT: v_mul_hi_u32 v9, v17, v10
+; CGP-NEXT: v_mul_hi_u32 v10, v16, v10
; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v11
; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v9
; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; CGP-NEXT: v_mul_lo_u32 v9, v19, v14
+; CGP-NEXT: v_mul_lo_u32 v9, v16, v14
; CGP-NEXT: v_add_i32_e32 v8, vcc, v11, v8
-; CGP-NEXT: v_mul_hi_u32 v11, v16, v14
+; CGP-NEXT: v_mul_hi_u32 v11, v17, v14
; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v10
; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v11
; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v11
-; CGP-NEXT: v_mul_hi_u32 v11, v19, v14
+; CGP-NEXT: v_mul_hi_u32 v11, v16, v14
; CGP-NEXT: v_add_i32_e32 v8, vcc, v9, v8
; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v9
; CGP-NEXT: v_add_i32_e32 v9, vcc, v11, v9
-; CGP-NEXT: v_add_i32_e32 v8, vcc, v16, v8
-; CGP-NEXT: v_addc_u32_e32 v9, vcc, v19, v9, vcc
-; CGP-NEXT: v_mul_lo_u32 v10, v18, v8
+; CGP-NEXT: v_add_i32_e32 v8, vcc, v17, v8
+; CGP-NEXT: v_addc_u32_e32 v9, vcc, v16, v9, vcc
+; CGP-NEXT: v_mul_lo_u32 v10, v19, v8
; CGP-NEXT: v_mul_lo_u32 v11, v15, v9
; CGP-NEXT: v_mul_hi_u32 v12, v15, v8
-; CGP-NEXT: v_mul_hi_u32 v8, v18, v8
+; CGP-NEXT: v_mul_hi_u32 v8, v19, v8
; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v11
; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v12
; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
-; CGP-NEXT: v_mul_lo_u32 v12, v18, v9
+; CGP-NEXT: v_mul_lo_u32 v12, v19, v9
; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10
; CGP-NEXT: v_mul_hi_u32 v11, v15, v9
; CGP-NEXT: v_add_i32_e32 v8, vcc, v12, v8
@@ -2222,16 +2201,16 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v11, vcc, v12, v11
; CGP-NEXT: v_add_i32_e32 v14, vcc, v8, v10
-; CGP-NEXT: v_mul_hi_u32 v12, v18, v9
+; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v8
+; CGP-NEXT: v_mul_hi_u32 v11, v19, v9
; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v4, v14, 0
-; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10
-; CGP-NEXT: v_add_i32_e32 v16, vcc, v12, v10
+; CGP-NEXT: v_add_i32_e32 v16, vcc, v11, v10
; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v4, v16, v[9:10]
; CGP-NEXT: v_sub_i32_e32 v8, vcc, v15, v8
; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v1, v14, v[10:11]
-; CGP-NEXT: v_subb_u32_e64 v9, s[4:5], v18, v12, vcc
-; CGP-NEXT: v_sub_i32_e64 v10, s[4:5], v18, v12
+; CGP-NEXT: v_subb_u32_e64 v9, s[4:5], v19, v12, vcc
+; CGP-NEXT: v_sub_i32_e64 v10, s[4:5], v19, v12
; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v1
; CGP-NEXT: v_subb_u32_e32 v10, vcc, v10, v1, vcc
; CGP-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5]
@@ -2256,7 +2235,7 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
; CGP-NEXT: v_cndmask_b32_e32 v4, v12, v8, vcc
; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9
; CGP-NEXT: v_cndmask_b32_e32 v1, v14, v1, vcc
-; CGP-NEXT: v_xor_b32_e32 v8, v17, v0
+; CGP-NEXT: v_xor_b32_e32 v8, v18, v0
; CGP-NEXT: v_cndmask_b32_e32 v4, v16, v4, vcc
; CGP-NEXT: v_xor_b32_e32 v0, v1, v8
; CGP-NEXT: v_xor_b32_e32 v1, v4, v8
@@ -2313,102 +2292,100 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
; CGP-NEXT: v_xor_b32_e32 v3, v6, v2
; CGP-NEXT: v_cvt_f32_u32_e32 v6, v4
; CGP-NEXT: v_cvt_f32_u32_e32 v8, v3
-; CGP-NEXT: v_sub_i32_e32 v14, vcc, 0, v4
-; CGP-NEXT: v_subb_u32_e32 v15, vcc, 0, v3, vcc
+; CGP-NEXT: v_sub_i32_e32 v15, vcc, 0, v4
+; CGP-NEXT: v_subb_u32_e32 v16, vcc, 0, v3, vcc
; CGP-NEXT: v_mac_f32_e32 v6, 0x4f800000, v8
; CGP-NEXT: v_rcp_iflag_f32_e32 v6, v6
; CGP-NEXT: v_mul_f32_e32 v6, 0x5f7ffffc, v6
; CGP-NEXT: v_mul_f32_e32 v8, 0x2f800000, v6
-; CGP-NEXT: v_trunc_f32_e32 v10, v8
-; CGP-NEXT: v_mac_f32_e32 v6, 0xcf800000, v10
-; CGP-NEXT: v_cvt_u32_f32_e32 v13, v6
-; CGP-NEXT: v_cvt_u32_f32_e32 v16, v10
-; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v14, v13, 0
-; CGP-NEXT: v_mov_b32_e32 v6, v9
-; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v14, v16, v[6:7]
-; CGP-NEXT: v_mul_lo_u32 v6, v16, v8
-; CGP-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v15, v13, v[9:10]
-; CGP-NEXT: v_mul_hi_u32 v9, v13, v8
-; CGP-NEXT: v_mul_hi_u32 v8, v16, v8
-; CGP-NEXT: v_mul_lo_u32 v10, v13, v11
-; CGP-NEXT: v_mul_lo_u32 v12, v16, v11
-; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v10
-; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v9
-; CGP-NEXT: v_mul_hi_u32 v9, v13, v11
-; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v6, vcc, v10, v6
-; CGP-NEXT: v_add_i32_e32 v8, vcc, v12, v8
+; CGP-NEXT: v_trunc_f32_e32 v8, v8
+; CGP-NEXT: v_mac_f32_e32 v6, 0xcf800000, v8
+; CGP-NEXT: v_cvt_u32_f32_e32 v6, v6
+; CGP-NEXT: v_cvt_u32_f32_e32 v14, v8
+; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v15, v6, 0
+; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v15, v14, v[9:10]
+; CGP-NEXT: v_mul_lo_u32 v9, v14, v8
+; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v16, v6, v[10:11]
+; CGP-NEXT: v_mul_hi_u32 v10, v6, v8
+; CGP-NEXT: v_mul_hi_u32 v8, v14, v8
+; CGP-NEXT: v_mul_lo_u32 v11, v6, v12
+; CGP-NEXT: v_mul_lo_u32 v13, v14, v12
+; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v11
+; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v10
+; CGP-NEXT: v_mul_hi_u32 v10, v6, v12
+; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v9, vcc, v11, v9
+; CGP-NEXT: v_add_i32_e32 v8, vcc, v13, v8
+; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v10
; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10
+; CGP-NEXT: v_mul_hi_u32 v11, v14, v12
; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v9
; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v9
-; CGP-NEXT: v_mul_hi_u32 v10, v16, v11
-; CGP-NEXT: v_add_i32_e32 v6, vcc, v8, v6
-; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v8, vcc, v9, v8
-; CGP-NEXT: v_add_i32_e32 v8, vcc, v10, v8
-; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v6
-; CGP-NEXT: v_addc_u32_e32 v16, vcc, v16, v8, vcc
-; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v14, v13, 0
-; CGP-NEXT: v_mov_b32_e32 v6, v9
-; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v14, v16, v[6:7]
-; CGP-NEXT: v_ashrrev_i32_e32 v14, 31, v7
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v14
-; CGP-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v15, v13, v[9:10]
-; CGP-NEXT: v_addc_u32_e32 v6, vcc, v7, v14, vcc
-; CGP-NEXT: v_xor_b32_e32 v12, v5, v14
-; CGP-NEXT: v_mul_lo_u32 v5, v16, v8
-; CGP-NEXT: v_mul_lo_u32 v7, v13, v11
-; CGP-NEXT: v_xor_b32_e32 v15, v6, v14
-; CGP-NEXT: v_mul_hi_u32 v6, v13, v8
-; CGP-NEXT: v_mul_hi_u32 v8, v16, v8
+; CGP-NEXT: v_add_i32_e32 v9, vcc, v11, v9
+; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v8
+; CGP-NEXT: v_addc_u32_e32 v14, vcc, v14, v9, vcc
+; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v15, v6, 0
+; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v15, v14, v[9:10]
+; CGP-NEXT: v_ashrrev_i32_e32 v15, 31, v7
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v15
+; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v16, v6, v[10:11]
+; CGP-NEXT: v_addc_u32_e32 v7, vcc, v7, v15, vcc
+; CGP-NEXT: v_xor_b32_e32 v11, v5, v15
+; CGP-NEXT: v_mul_lo_u32 v5, v14, v8
+; CGP-NEXT: v_mul_lo_u32 v9, v6, v12
+; CGP-NEXT: v_xor_b32_e32 v13, v7, v15
+; CGP-NEXT: v_mul_hi_u32 v7, v6, v8
+; CGP-NEXT: v_mul_hi_u32 v8, v14, v8
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v9
+; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v7
-; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v6
; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; CGP-NEXT: v_mul_lo_u32 v6, v16, v11
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5
-; CGP-NEXT: v_mul_hi_u32 v7, v13, v11
-; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v8
+; CGP-NEXT: v_mul_lo_u32 v7, v14, v12
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v9, v5
+; CGP-NEXT: v_mul_hi_u32 v9, v6, v12
+; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v8
; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v7
+; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v9
+; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v9
+; CGP-NEXT: v_mul_hi_u32 v9, v14, v12
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5
; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v7
-; CGP-NEXT: v_mul_hi_u32 v8, v16, v11
+; CGP-NEXT: v_add_i32_e32 v7, vcc, v9, v7
; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v5
-; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6
-; CGP-NEXT: v_add_i32_e32 v6, vcc, v8, v6
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v13, v5
-; CGP-NEXT: v_addc_u32_e32 v6, vcc, v16, v6, vcc
-; CGP-NEXT: v_mul_lo_u32 v7, v15, v5
-; CGP-NEXT: v_mul_lo_u32 v8, v12, v6
-; CGP-NEXT: v_mul_hi_u32 v9, v12, v5
-; CGP-NEXT: v_mul_hi_u32 v5, v15, v5
+; CGP-NEXT: v_addc_u32_e32 v6, vcc, v14, v7, vcc
+; CGP-NEXT: v_mul_lo_u32 v7, v13, v5
+; CGP-NEXT: v_mul_lo_u32 v8, v11, v6
+; CGP-NEXT: v_mul_hi_u32 v9, v11, v5
+; CGP-NEXT: v_mul_hi_u32 v5, v13, v5
; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v8
; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v9
; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; CGP-NEXT: v_mul_lo_u32 v9, v15, v6
+; CGP-NEXT: v_mul_lo_u32 v9, v13, v6
; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v7
-; CGP-NEXT: v_mul_hi_u32 v8, v12, v6
+; CGP-NEXT: v_mul_hi_u32 v8, v11, v6
; CGP-NEXT: v_add_i32_e32 v5, vcc, v9, v5
; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v8
; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v8, vcc, v9, v8
-; CGP-NEXT: v_add_i32_e32 v11, vcc, v5, v7
-; CGP-NEXT: v_mul_hi_u32 v9, v15, v6
-; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v11, 0
-; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v7
-; CGP-NEXT: v_add_i32_e32 v13, vcc, v9, v7
-; CGP-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v4, v13, v[6:7]
-; CGP-NEXT: v_sub_i32_e32 v5, vcc, v12, v5
-; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v3, v11, v[7:8]
-; CGP-NEXT: v_subb_u32_e64 v6, s[4:5], v15, v9, vcc
-; CGP-NEXT: v_sub_i32_e64 v7, s[4:5], v15, v9
+; CGP-NEXT: v_add_i32_e32 v12, vcc, v5, v7
+; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v5
+; CGP-NEXT: v_mul_hi_u32 v8, v13, v6
+; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v12, 0
+; CGP-NEXT: v_add_i32_e32 v14, vcc, v8, v7
+; CGP-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v4, v14, v[6:7]
+; CGP-NEXT: v_sub_i32_e32 v5, vcc, v11, v5
+; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v3, v12, v[7:8]
+; CGP-NEXT: v_subb_u32_e64 v6, s[4:5], v13, v9, vcc
+; CGP-NEXT: v_sub_i32_e64 v7, s[4:5], v13, v9
; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v3
; CGP-NEXT: v_subb_u32_e32 v7, vcc, v7, v3, vcc
; CGP-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5]
@@ -2418,8 +2395,8 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v6, v3
; CGP-NEXT: v_subbrev_u32_e32 v7, vcc, 0, v7, vcc
; CGP-NEXT: v_cndmask_b32_e64 v6, v8, v9, s[4:5]
-; CGP-NEXT: v_add_i32_e32 v8, vcc, 1, v11
-; CGP-NEXT: v_addc_u32_e32 v9, vcc, 0, v13, vcc
+; CGP-NEXT: v_add_i32_e32 v8, vcc, 1, v12
+; CGP-NEXT: v_addc_u32_e32 v9, vcc, 0, v14, vcc
;
CGP-NEXT: v_cmp_ge_u32_e32 vcc, v7, v3 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, -1, vcc ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v5, v4 @@ -2432,9 +2409,9 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; CGP-NEXT: v_cndmask_b32_e32 v3, v8, v4, vcc ; CGP-NEXT: v_cndmask_b32_e32 v4, v9, v5, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; CGP-NEXT: v_cndmask_b32_e32 v3, v11, v3, vcc -; CGP-NEXT: v_xor_b32_e32 v5, v14, v2 -; CGP-NEXT: v_cndmask_b32_e32 v4, v13, v4, vcc +; CGP-NEXT: v_cndmask_b32_e32 v3, v12, v3, vcc +; CGP-NEXT: v_xor_b32_e32 v5, v15, v2 +; CGP-NEXT: v_cndmask_b32_e32 v4, v14, v4, vcc ; CGP-NEXT: v_xor_b32_e32 v2, v3, v5 ; CGP-NEXT: v_xor_b32_e32 v3, v4, v5 ; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v5 @@ -2538,28 +2515,29 @@ define <2 x i64> @v_sdiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) { ; GISEL-NEXT: v_and_b32_e32 v1, 0xffffff, v4 ; GISEL-NEXT: v_cvt_f32_u32_e32 v3, v1 ; GISEL-NEXT: v_cvt_f32_ubyte0_e32 v10, 0 -; GISEL-NEXT: v_sub_i32_e32 v11, vcc, 0, v1 +; GISEL-NEXT: v_sub_i32_e32 v12, vcc, 0, v1 ; GISEL-NEXT: v_mac_f32_e32 v3, 0x4f800000, v10 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v3, v3 -; GISEL-NEXT: v_subb_u32_e64 v12, s[4:5], 0, 0, vcc +; GISEL-NEXT: v_subb_u32_e64 v13, s[4:5], 0, 0, vcc +; GISEL-NEXT: v_and_b32_e32 v2, 0xffffff, v2 ; GISEL-NEXT: v_mul_f32_e32 v3, 0x5f7ffffc, v3 ; GISEL-NEXT: v_mul_f32_e32 v4, 0x2f800000, v3 -; GISEL-NEXT: v_trunc_f32_e32 v5, v4 -; GISEL-NEXT: v_mac_f32_e32 v3, 0xcf800000, v5 -; GISEL-NEXT: v_cvt_u32_f32_e32 v9, v3 -; GISEL-NEXT: v_cvt_u32_f32_e32 v13, v5 -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v11, v9, 0 -; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v11, v13, v[4:5] -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v12, v9, v[7:8] -; GISEL-NEXT: v_mul_lo_u32 v5, v13, v3 -; GISEL-NEXT: v_mul_hi_u32 v7, v9, v3 -; GISEL-NEXT: v_mul_lo_u32 v8, v9, v4 -; GISEL-NEXT: v_mul_hi_u32 v3, v13, v3 -; GISEL-NEXT: v_mul_lo_u32 v14, v13, v4 +; GISEL-NEXT: v_trunc_f32_e32 v4, v4 +; GISEL-NEXT: v_mac_f32_e32 v3, 0xcf800000, v4 +; GISEL-NEXT: v_cvt_u32_f32_e32 v11, v3 +; GISEL-NEXT: v_cvt_u32_f32_e32 v9, v4 +; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v12, v11, 0 +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v12, v9, v[4:5] +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v13, v11, v[7:8] +; GISEL-NEXT: v_mul_lo_u32 v5, v9, v3 +; GISEL-NEXT: v_mul_hi_u32 v7, v11, v3 +; GISEL-NEXT: v_mul_lo_u32 v8, v11, v4 +; GISEL-NEXT: v_mul_hi_u32 v3, v9, v3 +; GISEL-NEXT: v_mul_lo_u32 v14, v9, v4 ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7 -; GISEL-NEXT: v_mul_hi_u32 v7, v9, v4 +; GISEL-NEXT: v_mul_hi_u32 v7, v11, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v8, v5 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v14, v3 @@ -2567,168 +2545,163 @@ define <2 x i64> @v_sdiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) { ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v8, v7 -; GISEL-NEXT: v_mul_hi_u32 v4, v13, v4 +; GISEL-NEXT: v_mul_hi_u32 v4, v9, v4 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v5 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v3 -; GISEL-NEXT: v_addc_u32_e32 v13, vcc, v13, v4, vcc -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v11, v9, 0 -; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v11, v13, v[4:5] -; GISEL-NEXT: 
v_and_b32_e32 v11, 0xffffff, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v9, v3 -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v12, v9, v[7:8] -; GISEL-NEXT: v_mul_lo_u32 v5, v13, v3 -; GISEL-NEXT: v_mul_hi_u32 v3, v13, v3 -; GISEL-NEXT: v_mul_lo_u32 v7, v9, v4 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v3 +; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v9, v4, vcc +; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v12, v11, 0 +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v12, v9, v[4:5] +; GISEL-NEXT: v_and_b32_e32 v12, 0xffffff, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v11, v3 +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v13, v11, v[7:8] +; GISEL-NEXT: v_mul_lo_u32 v5, v9, v3 +; GISEL-NEXT: v_mul_hi_u32 v3, v9, v3 +; GISEL-NEXT: v_mul_lo_u32 v7, v11, v4 ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v5, v13, v4 +; GISEL-NEXT: v_mul_lo_u32 v5, v9, v4 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v7, v0 -; GISEL-NEXT: v_mul_hi_u32 v7, v9, v4 +; GISEL-NEXT: v_mul_hi_u32 v7, v11, v4 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v5, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7 -; GISEL-NEXT: v_mul_hi_u32 v4, v13, v4 +; GISEL-NEXT: v_mul_hi_u32 v4, v9, v4 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v5, v3 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v9, v0 -; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v13, v3, vcc +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v11, v0 +; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v9, v3, vcc ; GISEL-NEXT: v_mul_lo_u32 v4, 0, v0 -; GISEL-NEXT: v_mul_lo_u32 v5, v11, v3 -; GISEL-NEXT: v_mul_hi_u32 v7, v11, v0 +; GISEL-NEXT: v_mul_lo_u32 v5, v12, v3 +; GISEL-NEXT: v_mul_hi_u32 v7, v12, v0 ; GISEL-NEXT: v_mul_hi_u32 v0, 0, v0 -; GISEL-NEXT: v_and_b32_e32 v13, 0xffffff, v2 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v5 ; GISEL-NEXT: v_mul_lo_u32 v5, 0, v3 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v7 -; GISEL-NEXT: v_mul_hi_u32 v7, v11, v3 +; GISEL-NEXT: v_mul_hi_u32 v7, v12, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v7 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v4 ; GISEL-NEXT: v_mul_hi_u32 v4, 0, v3 -; GISEL-NEXT: v_and_b32_e32 v3, 0xffffff, v6 ; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v1, v0, 0 -; GISEL-NEXT: v_cvt_f32_u32_e32 v12, v3 -; GISEL-NEXT: v_mov_b32_e32 v5, v8 -; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v1, v4, v[5:6] -; GISEL-NEXT: v_mac_f32_e32 v12, 0x4f800000, v10 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v2, v12 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], 0, v0, v[8:9] -; GISEL-NEXT: v_sub_i32_e32 v11, vcc, v11, v7 -; GISEL-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 -; GISEL-NEXT: v_mul_f32_e32 v6, 0x2f800000, v2 -; GISEL-NEXT: v_trunc_f32_e32 v8, v6 -; GISEL-NEXT: v_mac_f32_e32 v2, 0xcf800000, v8 -; GISEL-NEXT: v_cvt_u32_f32_e32 v14, v2 -; GISEL-NEXT: v_subb_u32_e64 v12, s[4:5], 0, v5, vcc +; GISEL-NEXT: v_and_b32_e32 v3, 0xffffff, v6 +; GISEL-NEXT: v_cvt_f32_u32_e32 v11, v3 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v1, v4, v[8:9] +; GISEL-NEXT: v_mac_f32_e32 v11, 0x4f800000, v10 +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], 0, v0, v[5:6] +; GISEL-NEXT: v_rcp_iflag_f32_e32 v5, v11 +; GISEL-NEXT: v_sub_i32_e32 v11, vcc, v12, v7 +; 
GISEL-NEXT: v_subb_u32_e64 v12, s[4:5], 0, v8, vcc +; GISEL-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5 +; GISEL-NEXT: v_mul_f32_e32 v6, 0x2f800000, v5 +; GISEL-NEXT: v_trunc_f32_e32 v6, v6 +; GISEL-NEXT: v_mac_f32_e32 v5, 0xcf800000, v6 +; GISEL-NEXT: v_cvt_u32_f32_e32 v13, v5 ; GISEL-NEXT: v_sub_i32_e64 v15, s[4:5], 0, v3 +; GISEL-NEXT: v_cvt_u32_f32_e32 v14, v6 ; GISEL-NEXT: v_subb_u32_e64 v16, s[4:5], 0, 0, s[4:5] -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v15, v14, 0 -; GISEL-NEXT: v_cvt_u32_f32_e32 v17, v8 -; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], 0, v5 -; GISEL-NEXT: v_mov_b32_e32 v2, v7 -; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v15, v17, v[2:3] +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v15, v13, 0 +; GISEL-NEXT: v_sub_i32_e64 v17, s[4:5], 0, v8 +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v15, v14, v[6:7] ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v11, v1 -; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[4:5] -; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v16, v14, v[7:8] +; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5] +; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v16, v13, v[7:8] ; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v10, -1, v2, s[4:5] -; GISEL-NEXT: v_mul_lo_u32 v2, v17, v6 -; GISEL-NEXT: v_mul_lo_u32 v7, v14, v9 -; GISEL-NEXT: v_subbrev_u32_e32 v8, vcc, 0, v5, vcc -; GISEL-NEXT: v_mul_hi_u32 v5, v14, v6 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v12, -1, v6, s[4:5] +; GISEL-NEXT: v_mul_lo_u32 v6, v14, v5 +; GISEL-NEXT: v_mul_lo_u32 v7, v13, v9 +; GISEL-NEXT: v_mul_hi_u32 v10, v13, v5 +; GISEL-NEXT: v_subbrev_u32_e32 v8, vcc, 0, v17, vcc +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v5, v17, v9 -; GISEL-NEXT: v_mul_hi_u32 v6, v17, v6 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v7, v2 -; GISEL-NEXT: v_mul_hi_u32 v7, v14, v9 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v6 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v10, v14, v9 +; GISEL-NEXT: v_mul_hi_u32 v5, v14, v5 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; GISEL-NEXT: v_mul_hi_u32 v7, v13, v9 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v10, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v7 -; GISEL-NEXT: v_mul_hi_u32 v7, v17, v9 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v14, v2 -; GISEL-NEXT: v_addc_u32_e32 v14, vcc, v17, v5, vcc -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v15, v12, 0 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v10, v7 +; GISEL-NEXT: v_mul_hi_u32 v9, v14, v9 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v6 +; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v9, v6 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v5 +; GISEL-NEXT: v_addc_u32_e32 v14, vcc, v14, v6, vcc +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v15, v13, 0 ; GISEL-NEXT: v_sub_i32_e32 v11, vcc, v11, v1 -; GISEL-NEXT: v_mov_b32_e32 v2, v6 -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v15, v14, v[2:3] ; GISEL-NEXT: v_subbrev_u32_e32 v17, vcc, 0, v8, vcc -; GISEL-NEXT: 
v_mad_u64_u32 v[8:9], s[4:5], v16, v12, v[6:7] -; GISEL-NEXT: v_add_i32_e32 v2, vcc, 1, v0 +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v15, v14, v[6:7] +; GISEL-NEXT: v_add_i32_e32 v15, vcc, 1, v0 +; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v16, v13, v[7:8] ; GISEL-NEXT: v_mul_lo_u32 v6, v14, v5 -; GISEL-NEXT: v_mul_lo_u32 v7, v12, v8 -; GISEL-NEXT: v_addc_u32_e32 v15, vcc, 0, v4, vcc -; GISEL-NEXT: v_mul_hi_u32 v9, v12, v5 +; GISEL-NEXT: v_addc_u32_e32 v18, vcc, 0, v4, vcc +; GISEL-NEXT: v_mul_lo_u32 v7, v13, v9 +; GISEL-NEXT: v_mul_hi_u32 v8, v13, v5 ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v11, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc ; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v17 ; GISEL-NEXT: v_cndmask_b32_e32 v1, -1, v1, vcc ; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v9 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v9, v14, v8 +; GISEL-NEXT: v_mul_lo_u32 v8, v14, v9 ; GISEL-NEXT: v_mul_hi_u32 v5, v14, v5 ; GISEL-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; GISEL-NEXT: v_mul_hi_u32 v7, v12, v8 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v9, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_mul_hi_u32 v7, v13, v9 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v8, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v9, v7 -; GISEL-NEXT: v_mul_hi_u32 v8, v14, v8 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v8, v7 +; GISEL-NEXT: v_mul_hi_u32 v8, v14, v9 ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v6, vcc, v7, v6 ; GISEL-NEXT: v_add_i32_e32 v6, vcc, v8, v6 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v12, v5 -; GISEL-NEXT: v_addc_u32_e32 v7, vcc, v14, v6, vcc -; GISEL-NEXT: v_mul_lo_u32 v6, 0, v5 -; GISEL-NEXT: v_mul_lo_u32 v8, v13, v7 -; GISEL-NEXT: v_mul_hi_u32 v12, v13, v5 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, 1, v2 -; GISEL-NEXT: v_addc_u32_e32 v11, vcc, 0, v15, vcc -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v8 -; GISEL-NEXT: v_mul_lo_u32 v8, 0, v7 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v13, v5 +; GISEL-NEXT: v_addc_u32_e32 v6, vcc, v14, v6, vcc +; GISEL-NEXT: v_mul_lo_u32 v7, 0, v5 +; GISEL-NEXT: v_mul_lo_u32 v8, v2, v6 +; GISEL-NEXT: v_mul_hi_u32 v11, v2, v5 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, 1, v15 +; GISEL-NEXT: v_addc_u32_e32 v10, vcc, 0, v18, vcc +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v8 +; GISEL-NEXT: v_mul_lo_u32 v8, 0, v6 ; GISEL-NEXT: v_mul_hi_u32 v5, 0, v5 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v12 -; GISEL-NEXT: v_mul_hi_u32 v12, v13, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v11 +; GISEL-NEXT: v_mul_hi_u32 v11, v2, v6 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v8, v5 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v12 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v5, v6 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v3, v12, 0 -; GISEL-NEXT: v_mul_hi_u32 v14, 0, v7 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v11 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v5, v7 +; GISEL-NEXT: v_mul_hi_u32 v13, 0, v6 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v3, v11, 0 ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc -; GISEL-NEXT: v_mov_b32_e32 v1, v6 -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v3, v14, v[1:2] -; GISEL-NEXT: 
v_cndmask_b32_e32 v8, v15, v11, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 -; GISEL-NEXT: v_cndmask_b32_e32 v1, v4, v8, vcc -; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], 0, v12, v[6:7] -; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v13, v5 -; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], 0, v8 -; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], 0, v8, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v1, v15, v9, vcc +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v3, v13, v[6:7] +; GISEL-NEXT: v_cndmask_b32_e32 v9, v18, v10, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v1, v4, v9, vcc +; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], 0, v11, v[7:8] +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v5 +; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], 0, v9 +; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], 0, v9, vcc ; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v3 ; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v3 @@ -2736,8 +2709,8 @@ define <2 x i64> @v_sdiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) { ; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v4 ; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v4, -1, v6, s[4:5] -; GISEL-NEXT: v_add_i32_e32 v6, vcc, 1, v12 -; GISEL-NEXT: v_addc_u32_e32 v7, vcc, 0, v14, vcc +; GISEL-NEXT: v_add_i32_e32 v6, vcc, 1, v11 +; GISEL-NEXT: v_addc_u32_e32 v7, vcc, 0, v13, vcc ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v2, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc ; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 @@ -2748,8 +2721,8 @@ define <2 x i64> @v_sdiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) { ; GISEL-NEXT: v_cndmask_b32_e32 v2, v6, v3, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v3, v7, v5, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GISEL-NEXT: v_cndmask_b32_e32 v2, v12, v2, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v3, v14, v3, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v2, v11, v2, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v3, v13, v3, vcc ; GISEL-NEXT: s_setpc_b64 s[30:31] ; ; CGP-LABEL: v_sdiv_v2i64_24bit: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll index f4489c2239fda..9d6ffc9bbc0dc 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll @@ -172,11 +172,11 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX8-NEXT: s_subb_u32 s15, 0, s9 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GFX8-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 -; GFX8-NEXT: v_trunc_f32_e32 v2, v1 -; GFX8-NEXT: v_mul_f32_e32 v1, 0xcf800000, v2 -; GFX8-NEXT: v_add_f32_e32 v0, v1, v0 +; GFX8-NEXT: v_trunc_f32_e32 v1, v1 +; GFX8-NEXT: v_mul_f32_e32 v2, 0xcf800000, v1 +; GFX8-NEXT: v_add_f32_e32 v0, v2, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v6, v0 -; GFX8-NEXT: v_cvt_u32_f32_e32 v7, v2 +; GFX8-NEXT: v_cvt_u32_f32_e32 v7, v1 ; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s14, v6, 0 ; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s14, v7, v[1:2] ; GFX8-NEXT: v_mul_lo_u32 v1, v7, v0 @@ -247,11 +247,11 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v4, v3 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, v0, v2 -; GFX8-NEXT: v_mul_hi_u32 v4, s11, v1 +; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v0 +; GFX8-NEXT: v_mul_hi_u32 v3, s11, v1 ; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s8, v6, 0 
-; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v4, v2 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v3, v2 ; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s8, v7, v[1:2] ; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s10, v0 ; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s9, v6, v[2:3] @@ -333,11 +333,11 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX9-NEXT: s_subb_u32 s11, 0, s7 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 -; GFX9-NEXT: v_trunc_f32_e32 v2, v1 -; GFX9-NEXT: v_mul_f32_e32 v1, 0xcf800000, v2 -; GFX9-NEXT: v_add_f32_e32 v0, v1, v0 +; GFX9-NEXT: v_trunc_f32_e32 v1, v1 +; GFX9-NEXT: v_mul_f32_e32 v2, 0xcf800000, v1 +; GFX9-NEXT: v_add_f32_e32 v0, v2, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v0 -; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v2 +; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v1 ; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s10, v6, 0 ; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s10, v7, v[1:2] ; GFX9-NEXT: v_mul_lo_u32 v1, v7, v0 @@ -400,20 +400,20 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX9-NEXT: v_mul_lo_u32 v4, s9, v1 ; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 ; GFX9-NEXT: v_mul_hi_u32 v3, s8, v1 -; GFX9-NEXT: v_mov_b32_e32 v7, s7 +; GFX9-NEXT: v_mov_b32_e32 v6, s7 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v4, v0 ; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v0, v2 -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s6, v6, 0 +; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v0, v2 +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s6, v7, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX9-NEXT: v_add_u32_e32 v3, v4, v3 ; GFX9-NEXT: v_add3_u32 v8, v3, v2, v5 ; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s6, v8, v[1:2] ; GFX9-NEXT: v_mov_b32_e32 v1, s9 ; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s8, v0 -; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s7, v6, v[2:3] +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s7, v7, v[2:3] ; GFX9-NEXT: v_subb_co_u32_e64 v1, s[0:1], v1, v4, vcc ; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s7, v1 ; GFX9-NEXT: v_sub_u32_e32 v2, s9, v4 @@ -421,19 +421,19 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s6, v0 ; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[0:1] ; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s7, v1 -; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v7, vcc +; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v6, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[0:1] ; GFX9-NEXT: v_subrev_co_u32_e32 v4, vcc, s6, v0 ; GFX9-NEXT: v_subbrev_co_u32_e64 v5, s[0:1], 0, v2, vcc -; GFX9-NEXT: v_add_co_u32_e64 v10, s[0:1], 1, v6 +; GFX9-NEXT: v_add_co_u32_e64 v10, s[0:1], 1, v7 ; GFX9-NEXT: v_addc_co_u32_e64 v11, s[0:1], 0, v8, s[0:1] ; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s7, v5 ; GFX9-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[0:1] ; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s6, v4 -; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v7, vcc +; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v6, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[0:1] ; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s7, v5 -; GFX9-NEXT: v_subrev_co_u32_e32 v7, vcc, s6, v4 +; GFX9-NEXT: v_subrev_co_u32_e32 v6, vcc, s6, v4 ; GFX9-NEXT: v_cndmask_b32_e64 v12, v12, v13, s[0:1] ; GFX9-NEXT: v_add_co_u32_e64 v13, s[0:1], 1, v10 ; GFX9-NEXT: v_subbrev_co_u32_e32 v2, vcc, 0, v2, vcc @@ -442,15 +442,15 @@ define 
amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX9-NEXT: v_cndmask_b32_e32 v10, v10, v13, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v11, v11, v14, vcc ; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v3, v6, v10, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v6, v8, v11, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v3, v7, v10, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v7, v8, v11, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e64 v4, v0, v4, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e64 v2, v1, v2, s[0:1] ; GFX9-NEXT: s_xor_b64 s[0:1], s[2:3], s[4:5] ; GFX9-NEXT: v_xor_b32_e32 v0, s0, v3 -; GFX9-NEXT: v_xor_b32_e32 v1, s1, v6 +; GFX9-NEXT: v_xor_b32_e32 v1, s1, v7 ; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: v_subrev_co_u32_e32 v0, vcc, s0, v0 ; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v3, vcc @@ -554,29 +554,29 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s8 ; GFX10-NEXT: v_add_co_u32 v0, s8, v5, v0 ; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s8 +; GFX10-NEXT: v_mul_hi_u32 v5, s1, v1 ; GFX10-NEXT: v_add_nc_u32_e32 v2, v6, v2 ; GFX10-NEXT: v_add_co_u32 v0, s8, v0, v3 ; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s8 -; GFX10-NEXT: v_add_co_u32 v5, s8, v0, v2 -; GFX10-NEXT: v_mul_hi_u32 v2, s1, v1 -; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s8 -; GFX10-NEXT: v_add_nc_u32_e32 v3, v4, v3 -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s8, s6, v5, 0 -; GFX10-NEXT: v_add3_u32 v3, v3, v6, v2 +; GFX10-NEXT: v_add_co_u32 v6, s8, v0, v2 +; GFX10-NEXT: v_add_nc_u32_e32 v2, v4, v3 +; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s8 +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s8, s6, v6, 0 +; GFX10-NEXT: v_add3_u32 v3, v2, v3, v5 ; GFX10-NEXT: v_mad_u64_u32 v[1:2], s8, s6, v3, v[1:2] -; GFX10-NEXT: v_mad_u64_u32 v[1:2], s8, s7, v5, v[1:2] -; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v5, 1 +; GFX10-NEXT: v_mad_u64_u32 v[1:2], s8, s7, v6, v[1:2] +; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v6, 1 ; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v3, vcc_lo ; GFX10-NEXT: v_sub_co_u32 v0, vcc_lo, s0, v0 -; GFX10-NEXT: v_sub_nc_u32_e32 v6, s1, v1 +; GFX10-NEXT: v_sub_nc_u32_e32 v5, s1, v1 ; GFX10-NEXT: v_sub_co_ci_u32_e64 v1, s0, s1, v1, vcc_lo -; GFX10-NEXT: v_subrev_co_ci_u32_e32 v6, vcc_lo, s7, v6, vcc_lo +; GFX10-NEXT: v_subrev_co_ci_u32_e32 v5, vcc_lo, s7, v5, vcc_lo ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s6, v0 ; GFX10-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc_lo ; GFX10-NEXT: v_sub_co_u32 v8, vcc_lo, v0, s6 -; GFX10-NEXT: v_subrev_co_ci_u32_e64 v9, s0, 0, v6, vcc_lo +; GFX10-NEXT: v_subrev_co_ci_u32_e64 v9, s0, 0, v5, vcc_lo ; GFX10-NEXT: v_cmp_le_u32_e64 s0, s7, v1 -; GFX10-NEXT: v_subrev_co_ci_u32_e32 v6, vcc_lo, s7, v6, vcc_lo +; GFX10-NEXT: v_subrev_co_ci_u32_e32 v5, vcc_lo, s7, v5, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v10, 0, -1, s0 ; GFX10-NEXT: v_cmp_le_u32_e64 s0, s6, v8 ; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, -1, s0 @@ -590,16 +590,16 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v11 ; GFX10-NEXT: v_cndmask_b32_e64 v7, v10, v7, s0 ; GFX10-NEXT: v_sub_co_u32 v10, s0, v8, s6 -; GFX10-NEXT: v_subrev_co_ci_u32_e64 v6, s0, 0, v6, s0 +; GFX10-NEXT: v_subrev_co_ci_u32_e64 v5, s0, 0, v5, s0 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v13, vcc_lo ; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, v7 ; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v14, 
vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v7, v8, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v6, v9, v6, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v5, v2, s0 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v9, v5, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v6, v2, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v4, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v7, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v6, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v5, s0 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: v_xor_b32_e32 v2, s4, v2 ; GFX10-NEXT: v_xor_b32_e32 v3, s5, v3 @@ -1308,11 +1308,11 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX8-NEXT: s_subb_u32 s17, 0, s9 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GFX8-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 -; GFX8-NEXT: v_trunc_f32_e32 v2, v1 -; GFX8-NEXT: v_mul_f32_e32 v1, 0xcf800000, v2 -; GFX8-NEXT: v_add_f32_e32 v0, v1, v0 +; GFX8-NEXT: v_trunc_f32_e32 v1, v1 +; GFX8-NEXT: v_mul_f32_e32 v2, 0xcf800000, v1 +; GFX8-NEXT: v_add_f32_e32 v0, v2, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v6, v0 -; GFX8-NEXT: v_cvt_u32_f32_e32 v7, v2 +; GFX8-NEXT: v_cvt_u32_f32_e32 v7, v1 ; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s16, v6, 0 ; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s16, v7, v[1:2] ; GFX8-NEXT: v_mul_lo_u32 v1, v7, v0 @@ -1386,163 +1386,162 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v4, v3 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, v0, v2 -; GFX8-NEXT: v_mul_hi_u32 v4, s11, v1 +; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v0 +; GFX8-NEXT: v_mul_hi_u32 v3, s11, v1 ; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s8, v6, 0 -; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v4, v2 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v3, v2 ; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s8, v7, v[1:2] ; GFX8-NEXT: v_sub_u32_e32 v9, vcc, s10, v0 ; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s9, v6, v[2:3] ; GFX8-NEXT: v_mov_b32_e32 v1, s9 -; GFX8-NEXT: s_ashr_i32 s10, s3, 31 ; GFX8-NEXT: v_subb_u32_e64 v8, s[0:1], v8, v4, vcc ; GFX8-NEXT: v_sub_u32_e64 v0, s[0:1], s11, v4 ; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v8 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[0:1] ; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v9 -; GFX8-NEXT: v_subb_u32_e32 v0, vcc, v0, v1, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[0:1] ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v8 -; GFX8-NEXT: v_subrev_u32_e32 v4, vcc, s8, v9 +; GFX8-NEXT: v_subb_u32_e32 v0, vcc, v0, v1, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] -; GFX8-NEXT: v_subbrev_u32_e64 v10, s[0:1], 0, v0, vcc -; GFX8-NEXT: v_add_u32_e64 v3, s[0:1], 1, v6 -; GFX8-NEXT: v_addc_u32_e64 v5, s[0:1], 0, v7, s[0:1] -; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v10 +; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, s8, v9 +; GFX8-NEXT: v_subbrev_u32_e64 v4, s[0:1], 0, v0, vcc +; GFX8-NEXT: v_add_u32_e64 v5, s[0:1], 1, v6 +; GFX8-NEXT: v_addc_u32_e64 v10, s[0:1], 0, v7, s[0:1] +; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v4 ; GFX8-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[0:1] -; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v4 +; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v3 +; GFX8-NEXT: v_subb_u32_e32 v0, vcc, v0, v1, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[0:1] -; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v10 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v4 +; GFX8-NEXT: v_subrev_u32_e32 v14, vcc, s8, v3 +; GFX8-NEXT: 
s_ashr_i32 s8, s3, 31 ; GFX8-NEXT: v_cndmask_b32_e64 v11, v11, v12, s[0:1] -; GFX8-NEXT: v_add_u32_e64 v12, s[0:1], 1, v3 -; GFX8-NEXT: v_addc_u32_e64 v13, s[0:1], 0, v5, s[0:1] -; GFX8-NEXT: s_add_u32 s0, s18, s6 -; GFX8-NEXT: s_addc_u32 s1, s19, s6 -; GFX8-NEXT: s_add_u32 s2, s2, s10 -; GFX8-NEXT: s_mov_b32 s11, s10 -; GFX8-NEXT: s_addc_u32 s3, s3, s10 -; GFX8-NEXT: s_xor_b64 s[2:3], s[2:3], s[10:11] -; GFX8-NEXT: v_cvt_f32_u32_e32 v14, s3 -; GFX8-NEXT: v_subb_u32_e32 v0, vcc, v0, v1, vcc -; GFX8-NEXT: v_cvt_f32_u32_e32 v1, s2 -; GFX8-NEXT: v_subrev_u32_e32 v15, vcc, s8, v4 -; GFX8-NEXT: v_subbrev_u32_e32 v16, vcc, 0, v0, vcc -; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f800000, v14 -; GFX8-NEXT: v_add_f32_e32 v0, v0, v1 -; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX8-NEXT: v_add_u32_e64 v12, s[0:1], 1, v5 +; GFX8-NEXT: s_add_u32 s10, s18, s6 +; GFX8-NEXT: v_addc_u32_e64 v13, s[0:1], 0, v10, s[0:1] +; GFX8-NEXT: s_addc_u32 s11, s19, s6 +; GFX8-NEXT: s_add_u32 s0, s2, s8 +; GFX8-NEXT: s_mov_b32 s9, s8 +; GFX8-NEXT: s_addc_u32 s1, s3, s8 +; GFX8-NEXT: s_xor_b64 s[2:3], s[0:1], s[8:9] +; GFX8-NEXT: v_cvt_f32_u32_e32 v1, s3 +; GFX8-NEXT: v_subbrev_u32_e32 v15, vcc, 0, v0, vcc +; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s2 +; GFX8-NEXT: v_mul_f32_e32 v1, 0x4f800000, v1 +; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v2 ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v12, vcc -; GFX8-NEXT: s_xor_b64 s[8:9], s[0:1], s[6:7] -; GFX8-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 -; GFX8-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 -; GFX8-NEXT: v_trunc_f32_e32 v11, v1 -; GFX8-NEXT: v_mul_f32_e32 v1, 0xcf800000, v11 ; GFX8-NEXT: v_add_f32_e32 v0, v1, v0 -; GFX8-NEXT: v_cvt_u32_f32_e32 v12, v0 +; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX8-NEXT: s_xor_b64 s[10:11], s[10:11], s[6:7] ; GFX8-NEXT: s_sub_u32 s5, 0, s2 -; GFX8-NEXT: v_cvt_u32_f32_e32 v11, v11 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v5, v13, vcc -; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s5, v12, 0 -; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v2 -; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, v3, s[0:1] -; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[18:19], s5, v11, v[1:2] +; GFX8-NEXT: v_cndmask_b32_e32 v5, v5, v12, vcc +; GFX8-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 +; GFX8-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 +; GFX8-NEXT: v_trunc_f32_e32 v1, v1 +; GFX8-NEXT: v_mul_f32_e32 v2, 0xcf800000, v1 +; GFX8-NEXT: v_add_f32_e32 v0, v2, v0 +; GFX8-NEXT: v_cvt_u32_f32_e32 v11, v0 +; GFX8-NEXT: v_cvt_u32_f32_e32 v12, v1 +; GFX8-NEXT: v_cndmask_b32_e32 v10, v10, v13, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v7, v7, v10, s[0:1] +; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[18:19], s5, v11, 0 +; GFX8-NEXT: v_cndmask_b32_e32 v10, v3, v14, vcc ; GFX8-NEXT: s_subb_u32 s20, 0, s3 -; GFX8-NEXT: v_cndmask_b32_e64 v7, v7, v5, s[0:1] +; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[18:19], s5, v12, v[1:2] +; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, v5, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v15, vcc -; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[18:19], s20, v12, v[2:3] -; GFX8-NEXT: v_cndmask_b32_e32 v2, v10, v16, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v9, v9, v1, s[0:1] -; GFX8-NEXT: v_mul_lo_u32 v1, v11, v0 -; GFX8-NEXT: v_mul_lo_u32 v3, v12, v4 -; GFX8-NEXT: v_cndmask_b32_e64 v8, v8, v2, s[0:1] -; GFX8-NEXT: v_mul_hi_u32 v2, v12, v0 -; GFX8-NEXT: v_mul_hi_u32 v0, v11, v0 -; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v3 -; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2 -; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[18:19], s20, v11, 
v[2:3] +; GFX8-NEXT: v_cndmask_b32_e64 v8, v8, v1, s[0:1] +; GFX8-NEXT: v_mul_lo_u32 v1, v12, v0 ; GFX8-NEXT: v_mul_lo_u32 v2, v11, v4 -; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1 -; GFX8-NEXT: v_mul_hi_u32 v3, v12, v4 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v2, v0 +; GFX8-NEXT: v_mul_hi_u32 v3, v11, v0 +; GFX8-NEXT: v_mul_hi_u32 v0, v12, v0 +; GFX8-NEXT: v_cndmask_b32_e64 v9, v9, v10, s[0:1] +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v3 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v3 +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX8-NEXT: v_mul_lo_u32 v3, v12, v4 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1 +; GFX8-NEXT: v_mul_hi_u32 v2, v11, v4 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v3, v0 ; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v3 -; GFX8-NEXT: v_mul_hi_u32 v3, v11, v4 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2 +; GFX8-NEXT: v_mul_hi_u32 v3, v12, v4 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 ; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1 -; GFX8-NEXT: v_add_u32_e32 v10, vcc, v12, v0 +; GFX8-NEXT: v_add_u32_e32 v10, vcc, v11, v0 ; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s5, v10, 0 -; GFX8-NEXT: v_addc_u32_e32 v11, vcc, v11, v1, vcc -; GFX8-NEXT: v_xor_b32_e32 v1, s16, v6 -; GFX8-NEXT: v_mov_b32_e32 v0, v3 -; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s5, v11, v[0:1] -; GFX8-NEXT: v_xor_b32_e32 v5, s17, v7 -; GFX8-NEXT: v_mov_b32_e32 v6, s17 -; GFX8-NEXT: v_subrev_u32_e32 v0, vcc, s16, v1 -; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v5, v6, vcc -; GFX8-NEXT: v_mad_u64_u32 v[5:6], s[0:1], s20, v10, v[3:4] +; GFX8-NEXT: v_addc_u32_e32 v11, vcc, v12, v1, vcc +; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s5, v11, v[3:4] +; GFX8-NEXT: v_xor_b32_e32 v6, s16, v6 +; GFX8-NEXT: v_xor_b32_e32 v1, s17, v7 +; GFX8-NEXT: v_mov_b32_e32 v7, s17 +; GFX8-NEXT: v_subrev_u32_e32 v0, vcc, s16, v6 +; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v1, v7, vcc +; GFX8-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s20, v10, v[4:5] ; GFX8-NEXT: v_mul_lo_u32 v4, v11, v2 ; GFX8-NEXT: v_xor_b32_e32 v3, s4, v9 -; GFX8-NEXT: v_mul_lo_u32 v7, v10, v5 +; GFX8-NEXT: v_mul_lo_u32 v7, v10, v6 ; GFX8-NEXT: v_mul_hi_u32 v9, v10, v2 ; GFX8-NEXT: v_mul_hi_u32 v2, v11, v2 -; GFX8-NEXT: v_xor_b32_e32 v6, s4, v8 +; GFX8-NEXT: v_xor_b32_e32 v5, s4, v8 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v7 ; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v9 ; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GFX8-NEXT: v_mul_lo_u32 v9, v11, v5 +; GFX8-NEXT: v_mul_lo_u32 v9, v11, v6 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v7, v4 -; GFX8-NEXT: v_mul_hi_u32 v7, v10, v5 +; GFX8-NEXT: v_mul_hi_u32 v7, v10, v6 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v9, v2 ; GFX8-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v7 ; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v7, vcc, v9, v7 -; GFX8-NEXT: v_mul_hi_u32 v5, v11, v5 +; GFX8-NEXT: v_mul_hi_u32 v6, v11, v6 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v4 ; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v7, v4 -; GFX8-NEXT: v_add_u32_e32 v4, vcc, v5, v4 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v6, v4 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v10, v2 -; GFX8-NEXT: v_addc_u32_e32 v7, vcc, v11, v4, vcc -; GFX8-NEXT: v_mul_lo_u32 v9, s9, v2 -; 
GFX8-NEXT: v_mul_lo_u32 v10, s8, v7 +; GFX8-NEXT: v_addc_u32_e32 v6, vcc, v11, v4, vcc +; GFX8-NEXT: v_mul_lo_u32 v7, s11, v2 +; GFX8-NEXT: v_mul_lo_u32 v9, s10, v6 ; GFX8-NEXT: v_subrev_u32_e32 v4, vcc, s4, v3 -; GFX8-NEXT: v_mul_hi_u32 v3, s8, v2 +; GFX8-NEXT: v_mul_hi_u32 v3, s10, v2 ; GFX8-NEXT: v_mov_b32_e32 v8, s4 -; GFX8-NEXT: v_subb_u32_e32 v5, vcc, v6, v8, vcc -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v9, v10 +; GFX8-NEXT: v_subb_u32_e32 v5, vcc, v5, v8, vcc +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v9 ; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v6, v3 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v7, v3 ; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GFX8-NEXT: v_mul_lo_u32 v6, s9, v7 -; GFX8-NEXT: v_mul_hi_u32 v2, s9, v2 +; GFX8-NEXT: v_mul_lo_u32 v7, s11, v6 +; GFX8-NEXT: v_mul_hi_u32 v2, s11, v2 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v8, v3 -; GFX8-NEXT: v_mul_hi_u32 v8, s8, v7 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v6, v2 -; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GFX8-NEXT: v_mul_hi_u32 v8, s10, v6 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v7, v2 +; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v8 ; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v8 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v8 ; GFX8-NEXT: v_add_u32_e32 v10, vcc, v2, v3 -; GFX8-NEXT: v_mul_hi_u32 v7, s9, v7 +; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX8-NEXT: v_mul_hi_u32 v6, s11, v6 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 ; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v10, 0 -; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v8 -; GFX8-NEXT: v_add_u32_e32 v11, vcc, v7, v6 +; GFX8-NEXT: v_add_u32_e32 v11, vcc, v6, v7 ; GFX8-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s2, v11, v[3:4] -; GFX8-NEXT: v_mov_b32_e32 v12, s9 -; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s8, v2 +; GFX8-NEXT: v_mov_b32_e32 v12, s11 +; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s10, v2 ; GFX8-NEXT: v_mad_u64_u32 v[8:9], s[0:1], s3, v10, v[6:7] ; GFX8-NEXT: v_mov_b32_e32 v3, s3 ; GFX8-NEXT: v_subb_u32_e64 v6, s[0:1], v12, v8, vcc -; GFX8-NEXT: v_sub_u32_e64 v7, s[0:1], s9, v8 +; GFX8-NEXT: v_sub_u32_e64 v7, s[0:1], s11, v8 ; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v6 ; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[0:1] ; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v2 @@ -1575,7 +1574,7 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX8-NEXT: v_cndmask_b32_e64 v8, v11, v14, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e64 v9, v2, v9, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, v3, s[0:1] -; GFX8-NEXT: s_xor_b64 s[0:1], s[6:7], s[10:11] +; GFX8-NEXT: s_xor_b64 s[0:1], s[6:7], s[8:9] ; GFX8-NEXT: v_xor_b32_e32 v2, s0, v7 ; GFX8-NEXT: v_xor_b32_e32 v3, s1, v8 ; GFX8-NEXT: v_mov_b32_e32 v7, s1 @@ -1619,11 +1618,11 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: s_subb_u32 s17, 0, s9 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 -; GFX9-NEXT: v_trunc_f32_e32 v2, v1 -; GFX9-NEXT: v_mul_f32_e32 v1, 0xcf800000, v2 -; GFX9-NEXT: v_add_f32_e32 v0, v1, v0 +; GFX9-NEXT: v_trunc_f32_e32 v1, v1 +; GFX9-NEXT: v_mul_f32_e32 v2, 0xcf800000, v1 +; GFX9-NEXT: v_add_f32_e32 v0, v2, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v0 -; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v2 +; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v1 ; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s16, v6, 0 ; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s16, v7, v[1:2] ; GFX9-NEXT: 
v_mul_lo_u32 v1, v7, v0 @@ -1703,152 +1702,151 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: v_sub_co_u32_e32 v9, vcc, s10, v0 ; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s9, v6, v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v1, s9 -; GFX9-NEXT: s_ashr_i32 s10, s3, 31 ; GFX9-NEXT: v_subb_co_u32_e64 v8, s[0:1], v8, v4, vcc -; GFX9-NEXT: v_sub_u32_e32 v0, s11, v4 ; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v8 +; GFX9-NEXT: v_sub_u32_e32 v0, s11, v4 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[0:1] ; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v9 -; GFX9-NEXT: v_subb_co_u32_e32 v0, vcc, v0, v1, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[0:1] ; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v8 -; GFX9-NEXT: v_subrev_co_u32_e32 v10, vcc, s8, v9 -; GFX9-NEXT: v_cndmask_b32_e64 v4, v2, v3, s[0:1] -; GFX9-NEXT: v_subbrev_co_u32_e64 v11, s[0:1], 0, v0, vcc -; GFX9-NEXT: v_add_co_u32_e64 v2, s[0:1], 1, v6 -; GFX9-NEXT: v_addc_co_u32_e64 v3, s[0:1], 0, v7, s[0:1] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v11 -; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v10 -; GFX9-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v11 -; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, v12, s[0:1] -; GFX9-NEXT: v_add_co_u32_e64 v12, s[0:1], 1, v2 -; GFX9-NEXT: v_addc_co_u32_e64 v13, s[0:1], 0, v3, s[0:1] -; GFX9-NEXT: s_add_u32 s0, s18, s6 -; GFX9-NEXT: s_addc_u32 s1, s19, s6 -; GFX9-NEXT: s_add_u32 s2, s2, s10 -; GFX9-NEXT: s_mov_b32 s11, s10 -; GFX9-NEXT: s_addc_u32 s3, s3, s10 -; GFX9-NEXT: s_xor_b64 s[2:3], s[2:3], s[10:11] -; GFX9-NEXT: v_cvt_f32_u32_e32 v14, s3 -; GFX9-NEXT: v_cvt_f32_u32_e32 v15, s2 ; GFX9-NEXT: v_subb_co_u32_e32 v0, vcc, v0, v1, vcc -; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f800000, v14 -; GFX9-NEXT: v_add_f32_e32 v1, v1, v15 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; GFX9-NEXT: v_subrev_co_u32_e32 v14, vcc, s8, v10 +; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] +; GFX9-NEXT: v_subrev_co_u32_e32 v3, vcc, s8, v9 +; GFX9-NEXT: v_subbrev_co_u32_e64 v4, s[0:1], 0, v0, vcc +; GFX9-NEXT: v_add_co_u32_e64 v5, s[0:1], 1, v6 +; GFX9-NEXT: v_addc_co_u32_e64 v10, s[0:1], 0, v7, s[0:1] +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v4 +; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[0:1] +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v3 +; GFX9-NEXT: v_subb_co_u32_e32 v0, vcc, v0, v1, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[0:1] +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v4 +; GFX9-NEXT: v_subrev_co_u32_e32 v14, vcc, s8, v3 +; GFX9-NEXT: s_ashr_i32 s8, s3, 31 +; GFX9-NEXT: v_cndmask_b32_e64 v11, v11, v12, s[0:1] +; GFX9-NEXT: v_add_co_u32_e64 v12, s[0:1], 1, v5 +; GFX9-NEXT: s_add_u32 s10, s18, s6 +; GFX9-NEXT: v_addc_co_u32_e64 v13, s[0:1], 0, v10, s[0:1] +; GFX9-NEXT: s_addc_u32 s11, s19, s6 +; GFX9-NEXT: s_add_u32 s0, s2, s8 +; GFX9-NEXT: s_mov_b32 s9, s8 +; GFX9-NEXT: s_addc_u32 s1, s3, s8 +; GFX9-NEXT: s_xor_b64 s[2:3], s[0:1], s[8:9] +; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s3 ; GFX9-NEXT: v_subbrev_co_u32_e32 v15, vcc, 0, v0, vcc -; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v1 -; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 -; GFX9-NEXT: v_trunc_f32_e32 v16, v1 -; GFX9-NEXT: v_mul_f32_e32 v1, 0xcf800000, v16 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s2 +; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f800000, v1 +; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v2 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 ; GFX9-NEXT: v_add_f32_e32 v0, v1, v0 -; GFX9-NEXT: v_cvt_u32_f32_e32 v17, v0 -; GFX9-NEXT: s_xor_b64 s[8:9], s[0:1], s[6:7] +; 
GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX9-NEXT: s_xor_b64 s[10:11], s[10:11], s[6:7] ; GFX9-NEXT: s_sub_u32 s5, 0, s2 -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s5, v17, 0 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v2, v12, vcc -; GFX9-NEXT: v_cvt_u32_f32_e32 v12, v16 -; GFX9-NEXT: v_cndmask_b32_e32 v13, v3, v13, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v12, vcc +; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 +; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 +; GFX9-NEXT: v_trunc_f32_e32 v1, v1 +; GFX9-NEXT: v_mul_f32_e32 v2, 0xcf800000, v1 +; GFX9-NEXT: v_add_f32_e32 v0, v2, v0 +; GFX9-NEXT: v_cvt_u32_f32_e32 v11, v0 +; GFX9-NEXT: v_cvt_u32_f32_e32 v12, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v10, v10, v13, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v10, s[0:1] +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[18:19], s5, v11, 0 +; GFX9-NEXT: v_cndmask_b32_e32 v10, v3, v14, vcc ; GFX9-NEXT: s_subb_u32 s20, 0, s3 -; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s5, v12, v[1:2] -; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v4 -; GFX9-NEXT: v_cndmask_b32_e64 v1, v6, v5, s[0:1] -; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[18:19], s20, v17, v[2:3] -; GFX9-NEXT: v_mul_lo_u32 v2, v12, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v10, v14, vcc -; GFX9-NEXT: v_mul_lo_u32 v3, v17, v4 -; GFX9-NEXT: v_mul_hi_u32 v10, v17, v0 -; GFX9-NEXT: v_cndmask_b32_e64 v5, v7, v13, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e32 v7, v11, v15, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v3 -; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v10 -; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: v_mul_lo_u32 v10, v12, v4 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[18:19], s5, v12, v[1:2] +; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, v5, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v15, vcc +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[18:19], s20, v11, v[2:3] +; GFX9-NEXT: v_cndmask_b32_e64 v8, v8, v1, s[0:1] +; GFX9-NEXT: v_mul_lo_u32 v1, v12, v0 +; GFX9-NEXT: v_mul_lo_u32 v2, v11, v4 +; GFX9-NEXT: v_mul_hi_u32 v3, v11, v0 ; GFX9-NEXT: v_mul_hi_u32 v0, v12, v0 -; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 -; GFX9-NEXT: v_mul_hi_u32 v3, v17, v4 -; GFX9-NEXT: v_mul_hi_u32 v4, v12, v4 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v10, v0 -; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3 +; GFX9-NEXT: v_cndmask_b32_e64 v9, v9, v10, s[0:1] +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v2 +; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v3 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX9-NEXT: v_mul_lo_u32 v3, v12, v4 +; GFX9-NEXT: v_add_u32_e32 v1, v2, v1 +; GFX9-NEXT: v_mul_hi_u32 v2, v11, v4 +; GFX9-NEXT: v_xor_b32_e32 v6, s16, v6 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v3, v0 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 -; GFX9-NEXT: v_add_u32_e32 v3, v10, v3 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: v_add3_u32 v2, v3, v2, v4 -; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, v17, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, v12, v2, vcc -; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[18:19], s5, v10, 0 -; GFX9-NEXT: v_xor_b32_e32 v1, s16, v1 -; GFX9-NEXT: v_cndmask_b32_e64 v9, v9, v6, s[0:1] -; GFX9-NEXT: v_mov_b32_e32 v0, v3 -; GFX9-NEXT: v_cndmask_b32_e64 v7, v8, v7, s[0:1] -; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s5, v11, v[0:1] -; GFX9-NEXT: v_xor_b32_e32 v8, s17, v5 -; GFX9-NEXT: v_mov_b32_e32 v12, s17 -; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[0:1], s20, v10, v[3:4] -; 
GFX9-NEXT: v_subrev_co_u32_e32 v0, vcc, s16, v1 -; GFX9-NEXT: v_mul_lo_u32 v3, v11, v2 -; GFX9-NEXT: v_mul_lo_u32 v6, v10, v5 -; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v8, v12, vcc -; GFX9-NEXT: v_mul_hi_u32 v8, v10, v2 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v6 -; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v8 -; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GFX9-NEXT: v_mul_lo_u32 v8, v11, v5 +; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 +; GFX9-NEXT: v_mul_hi_u32 v3, v12, v4 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, v11, v0 +; GFX9-NEXT: v_add3_u32 v1, v2, v1, v3 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s5, v10, 0 +; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, v12, v1, vcc +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s5, v11, v[3:4] +; GFX9-NEXT: v_xor_b32_e32 v1, s17, v7 +; GFX9-NEXT: v_mov_b32_e32 v7, s17 +; GFX9-NEXT: v_subrev_co_u32_e32 v0, vcc, s16, v6 +; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v7, vcc +; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s20, v10, v[4:5] +; GFX9-NEXT: v_mul_lo_u32 v4, v11, v2 +; GFX9-NEXT: v_xor_b32_e32 v3, s4, v9 +; GFX9-NEXT: v_mul_lo_u32 v7, v10, v6 +; GFX9-NEXT: v_mul_hi_u32 v9, v10, v2 ; GFX9-NEXT: v_mul_hi_u32 v2, v11, v2 -; GFX9-NEXT: v_add_u32_e32 v3, v6, v3 -; GFX9-NEXT: v_mul_hi_u32 v6, v10, v5 -; GFX9-NEXT: v_mul_hi_u32 v5, v11, v5 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v8, v2 -; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v6 -; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v3 -; GFX9-NEXT: v_add_u32_e32 v6, v8, v6 -; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GFX9-NEXT: v_add3_u32 v3, v6, v3, v5 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v10, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v11, v3, vcc -; GFX9-NEXT: v_mul_lo_u32 v5, s9, v2 -; GFX9-NEXT: v_mul_lo_u32 v6, s8, v3 -; GFX9-NEXT: v_xor_b32_e32 v4, s4, v9 -; GFX9-NEXT: v_mul_hi_u32 v9, s8, v2 -; GFX9-NEXT: v_mul_hi_u32 v2, s9, v2 -; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v5, v6 -; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v5, v9 -; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX9-NEXT: v_mul_lo_u32 v9, s9, v3 -; GFX9-NEXT: v_add_u32_e32 v5, v6, v5 -; GFX9-NEXT: v_mul_hi_u32 v6, s8, v3 -; GFX9-NEXT: v_mul_hi_u32 v12, s9, v3 +; GFX9-NEXT: v_xor_b32_e32 v5, s4, v8 +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v7 +; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v9 +; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; GFX9-NEXT: v_mul_lo_u32 v9, v11, v6 +; GFX9-NEXT: v_add_u32_e32 v4, v7, v4 +; GFX9-NEXT: v_mul_hi_u32 v7, v10, v6 +; GFX9-NEXT: v_mul_hi_u32 v6, v11, v6 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v9, v2 ; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v6 -; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, v2, v5 -; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v10, 0 -; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GFX9-NEXT: v_add_u32_e32 v6, v9, v6 -; GFX9-NEXT: v_xor_b32_e32 v7, s4, v7 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v7 +; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 +; GFX9-NEXT: v_add_u32_e32 v7, v9, v7 +; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; GFX9-NEXT: v_add3_u32 v4, v7, v4, v6 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v10, v2 +; GFX9-NEXT: v_addc_co_u32_e32 
v6, vcc, v11, v4, vcc +; GFX9-NEXT: v_mul_lo_u32 v7, s11, v2 +; GFX9-NEXT: v_mul_lo_u32 v9, s10, v6 +; GFX9-NEXT: v_subrev_co_u32_e32 v4, vcc, s4, v3 +; GFX9-NEXT: v_mul_hi_u32 v3, s10, v2 ; GFX9-NEXT: v_mov_b32_e32 v8, s4 -; GFX9-NEXT: v_subrev_co_u32_e32 v4, vcc, s4, v4 -; GFX9-NEXT: v_add3_u32 v11, v6, v11, v12 -; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v7, v8, vcc +; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v5, v8, vcc +; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v7, v9 +; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v7, v3 +; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GFX9-NEXT: v_mul_lo_u32 v7, s11, v6 +; GFX9-NEXT: v_mul_hi_u32 v2, s11, v2 +; GFX9-NEXT: v_add_u32_e32 v3, v8, v3 +; GFX9-NEXT: v_mul_hi_u32 v8, s10, v6 +; GFX9-NEXT: v_mul_hi_u32 v6, s11, v6 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v7, v2 +; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v8 +; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, v2, v3 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v10, 0 +; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GFX9-NEXT: v_add_u32_e32 v7, v7, v8 +; GFX9-NEXT: v_add3_u32 v11, v7, v9, v6 ; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s2, v11, v[3:4] -; GFX9-NEXT: v_mov_b32_e32 v12, s9 -; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, s8, v2 +; GFX9-NEXT: v_mov_b32_e32 v12, s11 +; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, s10, v2 ; GFX9-NEXT: v_mad_u64_u32 v[8:9], s[0:1], s3, v10, v[6:7] ; GFX9-NEXT: v_mov_b32_e32 v3, s3 ; GFX9-NEXT: v_subb_co_u32_e64 v6, s[0:1], v12, v8, vcc ; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v6 -; GFX9-NEXT: v_sub_u32_e32 v7, s9, v8 +; GFX9-NEXT: v_sub_u32_e32 v7, s11, v8 ; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[0:1] ; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v2 ; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[0:1] @@ -1880,7 +1878,7 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: v_cndmask_b32_e64 v8, v11, v14, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e64 v9, v2, v9, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, v3, s[0:1] -; GFX9-NEXT: s_xor_b64 s[0:1], s[6:7], s[10:11] +; GFX9-NEXT: s_xor_b64 s[0:1], s[6:7], s[8:9] ; GFX9-NEXT: v_xor_b32_e32 v2, s0, v7 ; GFX9-NEXT: v_xor_b32_e32 v3, s1, v8 ; GFX9-NEXT: v_mov_b32_e32 v7, s1 @@ -1917,21 +1915,21 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX10-NEXT: s_subb_u32 s20, 0, s7 ; GFX10-NEXT: s_xor_b64 s[16:17], s[4:5], s[8:9] ; GFX10-NEXT: s_ashr_i32 s8, s19, 31 +; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s6 ; GFX10-NEXT: s_ashr_i32 s10, s3, 31 +; GFX10-NEXT: v_mul_f32_e32 v1, 0x4f800000, v1 ; GFX10-NEXT: s_add_u32 s18, s18, s8 ; GFX10-NEXT: s_addc_u32 s19, s19, s8 -; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s6 -; GFX10-NEXT: v_mul_f32_e32 v1, 0x4f800000, v1 ; GFX10-NEXT: s_add_u32 s2, s2, s10 ; GFX10-NEXT: s_mov_b32 s11, s10 ; GFX10-NEXT: s_addc_u32 s3, s3, s10 -; GFX10-NEXT: s_mov_b32 s9, s8 -; GFX10-NEXT: s_xor_b64 s[2:3], s[2:3], s[10:11] ; GFX10-NEXT: v_add_f32_e32 v0, v1, v0 +; GFX10-NEXT: s_xor_b64 s[2:3], s[2:3], s[10:11] +; GFX10-NEXT: s_mov_b32 s9, s8 ; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s3 ; GFX10-NEXT: v_cvt_f32_u32_e32 v2, s2 -; GFX10-NEXT: s_xor_b64 s[18:19], s[18:19], s[8:9] ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX10-NEXT: s_xor_b64 s[18:19], s[18:19], s[8:9] ; GFX10-NEXT: v_mul_f32_e32 v1, 0x4f800000, v1 ; GFX10-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 @@ -1940,256 +1938,253 @@ define amdgpu_kernel void 
@sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX10-NEXT: v_trunc_f32_e32 v2, v2 ; GFX10-NEXT: v_mul_f32_e32 v3, 0x5f7ffffc, v1 ; GFX10-NEXT: v_mul_f32_e32 v1, 0xcf800000, v2 +; GFX10-NEXT: v_cvt_u32_f32_e32 v7, v2 ; GFX10-NEXT: v_mul_f32_e32 v4, 0x2f800000, v3 -; GFX10-NEXT: v_cvt_u32_f32_e32 v9, v2 ; GFX10-NEXT: v_add_f32_e32 v0, v1, v0 -; GFX10-NEXT: v_trunc_f32_e32 v6, v4 -; GFX10-NEXT: v_cvt_u32_f32_e32 v7, v0 -; GFX10-NEXT: v_mul_f32_e32 v4, 0xcf800000, v6 -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s5, s21, v7, 0 +; GFX10-NEXT: v_trunc_f32_e32 v5, v4 +; GFX10-NEXT: v_cvt_u32_f32_e32 v6, v0 +; GFX10-NEXT: v_mul_f32_e32 v4, 0xcf800000, v5 +; GFX10-NEXT: v_cvt_u32_f32_e32 v9, v5 +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s5, s21, v6, 0 ; GFX10-NEXT: v_add_f32_e32 v3, v4, v3 -; GFX10-NEXT: s_sub_u32 s5, 0, s2 ; GFX10-NEXT: v_cvt_u32_f32_e32 v8, v3 -; GFX10-NEXT: v_mul_hi_u32 v10, v9, v0 -; GFX10-NEXT: v_mad_u64_u32 v[2:3], s22, s5, v8, 0 -; GFX10-NEXT: v_mad_u64_u32 v[4:5], s22, s21, v9, v[1:2] -; GFX10-NEXT: v_cvt_u32_f32_e32 v5, v6 -; GFX10-NEXT: v_mov_b32_e32 v1, v3 -; GFX10-NEXT: v_mul_hi_u32 v6, v7, v0 +; GFX10-NEXT: v_mad_u64_u32 v[3:4], s5, s21, v7, v[1:2] +; GFX10-NEXT: s_sub_u32 s5, 0, s2 +; GFX10-NEXT: v_mul_lo_u32 v10, v7, v0 +; GFX10-NEXT: v_mad_u64_u32 v[1:2], s23, s5, v8, 0 ; GFX10-NEXT: s_subb_u32 s22, 0, s3 -; GFX10-NEXT: v_mul_hi_u32 v12, v8, v2 -; GFX10-NEXT: v_mul_lo_u32 v11, v5, v2 -; GFX10-NEXT: v_mad_u64_u32 v[3:4], s23, s20, v7, v[4:5] -; GFX10-NEXT: v_mul_lo_u32 v4, v9, v0 -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s23, s5, v5, v[1:2] -; GFX10-NEXT: v_mul_hi_u32 v2, v5, v2 -; GFX10-NEXT: v_mul_lo_u32 v13, v7, v3 -; GFX10-NEXT: v_mul_lo_u32 v14, v9, v3 -; GFX10-NEXT: v_mul_hi_u32 v15, v7, v3 -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s23, s22, v8, v[0:1] -; GFX10-NEXT: v_mul_hi_u32 v1, v9, v3 -; GFX10-NEXT: v_add_co_u32 v3, s23, v4, v13 +; GFX10-NEXT: v_mul_hi_u32 v12, v7, v0 +; GFX10-NEXT: v_mad_u64_u32 v[3:4], s23, s20, v6, v[3:4] +; GFX10-NEXT: v_mul_hi_u32 v11, v6, v0 +; GFX10-NEXT: v_mul_hi_u32 v14, v9, v1 +; GFX10-NEXT: v_mad_u64_u32 v[4:5], s23, s5, v9, v[2:3] +; GFX10-NEXT: v_mul_hi_u32 v5, v8, v1 +; GFX10-NEXT: v_mul_lo_u32 v13, v6, v3 +; GFX10-NEXT: v_mul_lo_u32 v15, v7, v3 +; GFX10-NEXT: v_mul_lo_u32 v2, v9, v1 +; GFX10-NEXT: v_mul_hi_u32 v16, v6, v3 +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s23, s22, v8, v[4:5] +; GFX10-NEXT: v_mul_hi_u32 v1, v7, v3 +; GFX10-NEXT: v_add_co_u32 v3, s23, v10, v13 ; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s23 -; GFX10-NEXT: v_add_co_u32 v10, s23, v14, v10 -; GFX10-NEXT: v_cndmask_b32_e64 v13, 0, 1, s23 -; GFX10-NEXT: v_mul_lo_u32 v14, v8, v0 -; GFX10-NEXT: v_add_co_u32 v3, s23, v3, v6 +; GFX10-NEXT: v_add_co_u32 v10, s23, v15, v12 +; GFX10-NEXT: v_cndmask_b32_e64 v12, 0, 1, s23 +; GFX10-NEXT: v_mul_lo_u32 v13, v8, v0 +; GFX10-NEXT: v_add_co_u32 v3, s23, v3, v11 ; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s23 -; GFX10-NEXT: v_add_co_u32 v6, s23, v10, v15 -; GFX10-NEXT: v_mul_lo_u32 v15, v5, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v10, 0, 1, s23 +; GFX10-NEXT: v_mul_lo_u32 v15, v9, v0 +; GFX10-NEXT: v_add_co_u32 v10, s23, v10, v16 +; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, 1, s23 ; GFX10-NEXT: v_mul_hi_u32 v16, v8, v0 -; GFX10-NEXT: v_mul_hi_u32 v17, v5, v0 +; GFX10-NEXT: v_mul_hi_u32 v17, v9, v0 ; GFX10-NEXT: v_add_nc_u32_e32 v0, v4, v3 -; GFX10-NEXT: v_add_co_u32 v4, s23, v11, v14 -; GFX10-NEXT: v_add_nc_u32_e32 v3, v13, v10 -; GFX10-NEXT: v_cndmask_b32_e64 v10, 0, 1, s23 -; GFX10-NEXT: v_add_co_u32 v2, s23, v15, v2 -; GFX10-NEXT: v_cndmask_b32_e64 
v11, 0, 1, s23 -; GFX10-NEXT: v_add_co_u32 v0, s23, v6, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s23 -; GFX10-NEXT: v_add_co_u32 v4, s23, v4, v12 +; GFX10-NEXT: v_add_co_u32 v2, s23, v2, v13 +; GFX10-NEXT: v_add_nc_u32_e32 v3, v12, v11 ; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s23 -; GFX10-NEXT: v_add_co_u32 v2, s23, v2, v16 -; GFX10-NEXT: v_add3_u32 v1, v3, v6, v1 -; GFX10-NEXT: v_add_co_u32 v6, vcc_lo, v7, v0 -; GFX10-NEXT: v_add_nc_u32_e32 v3, v10, v4 +; GFX10-NEXT: v_add_co_u32 v11, s23, v15, v14 ; GFX10-NEXT: v_cndmask_b32_e64 v12, 0, 1, s23 -; GFX10-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, v9, v1, vcc_lo +; GFX10-NEXT: v_add_co_u32 v0, s23, v10, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v10, 0, 1, s23 +; GFX10-NEXT: v_add_co_u32 v2, s23, v2, v5 +; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s23 +; GFX10-NEXT: v_add_co_u32 v6, vcc_lo, v6, v0 +; GFX10-NEXT: v_add3_u32 v1, v3, v10, v1 +; GFX10-NEXT: v_add_co_u32 v5, s23, v11, v16 +; GFX10-NEXT: v_add_nc_u32_e32 v2, v4, v2 +; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, 1, s23 +; GFX10-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, v7, v1, vcc_lo ; GFX10-NEXT: v_mad_u64_u32 v[0:1], s23, s21, v6, 0 -; GFX10-NEXT: v_add_co_u32 v2, s23, v2, v3 -; GFX10-NEXT: v_add_nc_u32_e32 v4, v11, v12 -; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s23 -; GFX10-NEXT: v_mov_b32_e32 v10, 0 +; GFX10-NEXT: v_add_co_u32 v2, s23, v5, v2 +; GFX10-NEXT: v_add_nc_u32_e32 v3, v12, v11 +; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s23 ; GFX10-NEXT: v_add_co_u32 v8, vcc_lo, v8, v2 -; GFX10-NEXT: v_mul_hi_u32 v11, v7, v0 -; GFX10-NEXT: v_add3_u32 v3, v4, v3, v17 -; GFX10-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v5, v3, vcc_lo -; GFX10-NEXT: v_mad_u64_u32 v[2:3], s23, s5, v8, 0 -; GFX10-NEXT: v_mad_u64_u32 v[4:5], s21, s21, v7, v[1:2] -; GFX10-NEXT: v_mov_b32_e32 v1, v3 -; GFX10-NEXT: v_mul_lo_u32 v12, v9, v2 -; GFX10-NEXT: v_mul_hi_u32 v13, v8, v2 -; GFX10-NEXT: v_mad_u64_u32 v[3:4], s20, s20, v6, v[4:5] -; GFX10-NEXT: v_mul_lo_u32 v4, v7, v0 -; GFX10-NEXT: v_mul_hi_u32 v5, v6, v0 -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s5, s5, v9, v[1:2] -; GFX10-NEXT: v_mul_hi_u32 v2, v9, v2 -; GFX10-NEXT: v_mul_lo_u32 v14, v6, v3 +; GFX10-NEXT: v_mul_lo_u32 v10, v7, v0 +; GFX10-NEXT: v_add3_u32 v5, v3, v4, v17 +; GFX10-NEXT: v_mad_u64_u32 v[3:4], s21, s21, v7, v[1:2] +; GFX10-NEXT: v_mad_u64_u32 v[1:2], s21, s5, v8, 0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v9, v5, vcc_lo +; GFX10-NEXT: v_mul_hi_u32 v12, v7, v0 +; GFX10-NEXT: v_mul_hi_u32 v11, v6, v0 +; GFX10-NEXT: v_mad_u64_u32 v[3:4], s20, s20, v6, v[3:4] +; GFX10-NEXT: v_mul_hi_u32 v14, v9, v1 +; GFX10-NEXT: v_mad_u64_u32 v[4:5], s5, s5, v9, v[2:3] +; GFX10-NEXT: v_mul_hi_u32 v5, v8, v1 +; GFX10-NEXT: v_mul_lo_u32 v13, v6, v3 ; GFX10-NEXT: v_mul_lo_u32 v15, v7, v3 +; GFX10-NEXT: v_mul_lo_u32 v2, v9, v1 ; GFX10-NEXT: v_mul_hi_u32 v16, v6, v3 -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s5, s22, v8, v[0:1] +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s5, s22, v8, v[4:5] ; GFX10-NEXT: v_mul_hi_u32 v1, v7, v3 -; GFX10-NEXT: v_add_co_u32 v3, s5, v4, v14 +; GFX10-NEXT: v_add_co_u32 v3, s5, v10, v13 ; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s5 -; GFX10-NEXT: v_add_co_u32 v11, s5, v15, v11 -; GFX10-NEXT: v_cndmask_b32_e64 v14, 0, 1, s5 -; GFX10-NEXT: v_add_co_u32 v3, s5, v3, v5 -; GFX10-NEXT: v_mul_lo_u32 v15, v8, v0 +; GFX10-NEXT: v_add_co_u32 v10, s5, v15, v12 +; GFX10-NEXT: v_cndmask_b32_e64 v12, 0, 1, s5 +; GFX10-NEXT: v_mul_lo_u32 v13, v8, v0 +; GFX10-NEXT: v_add_co_u32 v3, s5, v3, v11 ; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s5 -; GFX10-NEXT: v_add_co_u32 v5, s5, v11, v16 -; 
GFX10-NEXT: v_mul_lo_u32 v16, v9, v0 +; GFX10-NEXT: v_mul_lo_u32 v15, v9, v0 +; GFX10-NEXT: v_add_co_u32 v10, s5, v10, v16 ; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, 1, s5 ; GFX10-NEXT: v_add_nc_u32_e32 v3, v4, v3 -; GFX10-NEXT: v_mul_hi_u32 v17, v8, v0 -; GFX10-NEXT: v_mul_hi_u32 v0, v9, v0 -; GFX10-NEXT: v_add_nc_u32_e32 v4, v14, v11 -; GFX10-NEXT: v_add_co_u32 v11, s5, v12, v15 -; GFX10-NEXT: v_cndmask_b32_e64 v12, 0, 1, s5 -; GFX10-NEXT: v_add_co_u32 v2, s5, v16, v2 -; GFX10-NEXT: v_cndmask_b32_e64 v14, 0, 1, s5 -; GFX10-NEXT: v_add_co_u32 v3, s5, v5, v3 -; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s5 -; GFX10-NEXT: v_add_co_u32 v11, s5, v11, v13 -; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, 1, s5 -; GFX10-NEXT: v_add_co_u32 v2, s5, v2, v17 -; GFX10-NEXT: v_add3_u32 v1, v4, v5, v1 -; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, v6, v3 +; GFX10-NEXT: v_add_co_u32 v2, s5, v2, v13 +; GFX10-NEXT: v_mul_hi_u32 v16, v8, v0 ; GFX10-NEXT: v_add_nc_u32_e32 v4, v12, v11 +; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, 1, s5 +; GFX10-NEXT: v_add_co_u32 v12, s5, v15, v14 ; GFX10-NEXT: v_cndmask_b32_e64 v13, 0, 1, s5 +; GFX10-NEXT: v_add_co_u32 v3, s5, v10, v3 +; GFX10-NEXT: v_cndmask_b32_e64 v10, 0, 1, s5 +; GFX10-NEXT: v_add_co_u32 v2, s5, v2, v5 +; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s5 +; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, v6, v3 +; GFX10-NEXT: v_add3_u32 v1, v4, v10, v1 +; GFX10-NEXT: v_add_co_u32 v5, s5, v12, v16 +; GFX10-NEXT: v_add_nc_u32_e32 v2, v11, v2 +; GFX10-NEXT: v_cndmask_b32_e64 v12, 0, 1, s5 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v7, v1, vcc_lo +; GFX10-NEXT: v_mul_hi_u32 v0, v9, v0 ; GFX10-NEXT: v_mul_lo_u32 v6, s1, v3 -; GFX10-NEXT: v_add_co_u32 v2, s5, v2, v4 -; GFX10-NEXT: v_add_nc_u32_e32 v5, v14, v13 -; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s5 -; GFX10-NEXT: v_mul_lo_u32 v11, s0, v1 +; GFX10-NEXT: v_add_co_u32 v2, s5, v5, v2 +; GFX10-NEXT: v_mul_lo_u32 v10, s0, v1 +; GFX10-NEXT: v_add_nc_u32_e32 v4, v13, v12 +; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s5 ; GFX10-NEXT: v_mul_hi_u32 v7, s0, v3 ; GFX10-NEXT: v_mul_hi_u32 v3, s1, v3 -; GFX10-NEXT: v_mul_lo_u32 v12, s1, v1 -; GFX10-NEXT: v_add3_u32 v0, v5, v4, v0 +; GFX10-NEXT: v_mul_lo_u32 v11, s1, v1 ; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v8, v2 +; GFX10-NEXT: v_add3_u32 v0, v4, v5, v0 ; GFX10-NEXT: v_mul_hi_u32 v4, s0, v1 ; GFX10-NEXT: v_mul_hi_u32 v5, s1, v1 -; GFX10-NEXT: v_add_co_u32 v1, s5, v6, v11 -; GFX10-NEXT: v_add_co_ci_u32_e32 v8, vcc_lo, v9, v0, vcc_lo +; GFX10-NEXT: v_add_co_u32 v1, s5, v6, v10 ; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s5 -; GFX10-NEXT: v_add_co_u32 v3, s5, v12, v3 -; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, 1, s5 +; GFX10-NEXT: v_add_co_u32 v3, s5, v11, v3 +; GFX10-NEXT: v_cndmask_b32_e64 v10, 0, 1, s5 ; GFX10-NEXT: v_add_co_u32 v1, s5, v1, v7 -; GFX10-NEXT: v_mul_lo_u32 v0, s19, v2 -; GFX10-NEXT: v_mul_lo_u32 v12, s18, v8 +; GFX10-NEXT: v_add_co_ci_u32_e32 v8, vcc_lo, v9, v0, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s5 +; GFX10-NEXT: v_mul_lo_u32 v0, s19, v2 ; GFX10-NEXT: v_add_co_u32 v3, s5, v3, v4 +; GFX10-NEXT: v_mul_lo_u32 v7, s18, v8 +; GFX10-NEXT: v_add_nc_u32_e32 v1, v6, v1 ; GFX10-NEXT: v_mul_hi_u32 v9, s18, v2 -; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s5 ; GFX10-NEXT: v_mul_hi_u32 v2, s19, v2 -; GFX10-NEXT: v_mul_lo_u32 v7, s19, v8 -; GFX10-NEXT: v_add_nc_u32_e32 v1, v6, v1 -; GFX10-NEXT: v_add_co_u32 v6, s5, v0, v12 -; GFX10-NEXT: v_mul_hi_u32 v13, s18, v8 -; GFX10-NEXT: v_add_nc_u32_e32 v4, v11, v4 -; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, 1, s5 -; GFX10-NEXT: v_add_co_u32 v12, s5, v3, v1 -; 
GFX10-NEXT: v_add_co_u32 v2, s20, v7, v2 -; GFX10-NEXT: v_cndmask_b32_e64 v7, 0, 1, s5 -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s5, s6, v12, 0 -; GFX10-NEXT: v_add_co_u32 v6, s5, v6, v9 -; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s5 -; GFX10-NEXT: v_add_co_u32 v9, s5, v2, v13 -; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s20 -; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s5 -; GFX10-NEXT: v_add3_u32 v4, v4, v7, v5 -; GFX10-NEXT: v_add_nc_u32_e32 v6, v11, v6 -; GFX10-NEXT: v_mul_hi_u32 v5, s19, v8 -; GFX10-NEXT: v_add_co_u32 v7, vcc_lo, v12, 1 -; GFX10-NEXT: v_add_nc_u32_e32 v3, v3, v2 +; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s5 +; GFX10-NEXT: v_mul_lo_u32 v6, s19, v8 +; GFX10-NEXT: v_add_co_u32 v3, s5, v3, v1 +; GFX10-NEXT: v_add_co_u32 v7, s20, v0, v7 +; GFX10-NEXT: v_add_nc_u32_e32 v4, v10, v4 +; GFX10-NEXT: v_cndmask_b32_e64 v12, 0, 1, s5 +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s5, s6, v3, 0 +; GFX10-NEXT: v_mul_hi_u32 v11, s18, v8 +; GFX10-NEXT: v_add_co_u32 v6, s5, v6, v2 +; GFX10-NEXT: v_cndmask_b32_e64 v13, 0, 1, s5 +; GFX10-NEXT: v_add3_u32 v4, v4, v12, v5 +; GFX10-NEXT: v_add_co_u32 v2, s5, v7, v9 +; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s5 +; GFX10-NEXT: v_cndmask_b32_e64 v10, 0, 1, s20 +; GFX10-NEXT: v_mul_hi_u32 v7, s19, v8 ; GFX10-NEXT: v_mad_u64_u32 v[1:2], s5, s6, v4, v[1:2] -; GFX10-NEXT: v_add_co_u32 v6, s5, v9, v6 -; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, 1, s5 -; GFX10-NEXT: v_add_co_ci_u32_e32 v8, vcc_lo, 0, v4, vcc_lo -; GFX10-NEXT: v_add_co_u32 v11, vcc_lo, v7, 1 -; GFX10-NEXT: v_mad_u64_u32 v[1:2], s5, s7, v12, v[1:2] -; GFX10-NEXT: v_add3_u32 v5, v3, v9, v5 -; GFX10-NEXT: v_mad_u64_u32 v[2:3], s5, s2, v6, 0 -; GFX10-NEXT: v_add_co_ci_u32_e32 v13, vcc_lo, 0, v8, vcc_lo -; GFX10-NEXT: v_sub_co_u32 v14, vcc_lo, s0, v0 -; GFX10-NEXT: v_sub_nc_u32_e32 v9, s1, v1 -; GFX10-NEXT: v_sub_co_ci_u32_e64 v15, s0, s1, v1, vcc_lo -; GFX10-NEXT: v_mov_b32_e32 v0, v3 -; GFX10-NEXT: v_subrev_co_ci_u32_e32 v9, vcc_lo, s7, v9, vcc_lo -; GFX10-NEXT: v_sub_co_u32 v3, vcc_lo, v14, s6 -; GFX10-NEXT: v_subrev_co_ci_u32_e64 v16, s0, 0, v9, vcc_lo -; GFX10-NEXT: v_cmp_le_u32_e64 s0, s6, v14 -; GFX10-NEXT: v_subrev_co_ci_u32_e32 v9, vcc_lo, s7, v9, vcc_lo +; GFX10-NEXT: v_add_co_u32 v6, s5, v6, v11 +; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s5 +; GFX10-NEXT: v_add_nc_u32_e32 v5, v10, v5 +; GFX10-NEXT: v_add_co_u32 v10, vcc_lo, v3, 1 +; GFX10-NEXT: v_add_co_ci_u32_e32 v11, vcc_lo, 0, v4, vcc_lo +; GFX10-NEXT: v_add_nc_u32_e32 v8, v13, v2 +; GFX10-NEXT: v_mad_u64_u32 v[1:2], s5, s7, v3, v[1:2] +; GFX10-NEXT: v_add_co_u32 v5, s5, v6, v5 +; GFX10-NEXT: v_sub_co_u32 v12, vcc_lo, s0, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s5 +; GFX10-NEXT: v_mov_b32_e32 v9, 0 +; GFX10-NEXT: v_sub_nc_u32_e32 v6, s1, v1 +; GFX10-NEXT: v_sub_co_ci_u32_e64 v13, s0, s1, v1, vcc_lo +; GFX10-NEXT: v_add3_u32 v7, v8, v2, v7 +; GFX10-NEXT: v_subrev_co_ci_u32_e32 v6, vcc_lo, s7, v6, vcc_lo +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s6, v12 +; GFX10-NEXT: v_cndmask_b32_e64 v14, 0, -1, vcc_lo +; GFX10-NEXT: v_sub_co_u32 v15, vcc_lo, v12, s6 +; GFX10-NEXT: v_subrev_co_ci_u32_e64 v16, s0, 0, v6, vcc_lo +; GFX10-NEXT: v_cmp_le_u32_e64 s0, s7, v13 +; GFX10-NEXT: v_subrev_co_ci_u32_e32 v6, vcc_lo, s7, v6, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v17, 0, -1, s0 -; GFX10-NEXT: v_cmp_le_u32_e64 s0, s6, v3 +; GFX10-NEXT: v_cmp_le_u32_e64 s0, s6, v15 ; GFX10-NEXT: v_cndmask_b32_e64 v18, 0, -1, s0 ; GFX10-NEXT: v_cmp_le_u32_e64 s0, s7, v16 ; GFX10-NEXT: v_cndmask_b32_e64 v19, 0, -1, s0 -; GFX10-NEXT: v_cmp_le_u32_e64 s0, s7, v15 -; GFX10-NEXT: 
v_cndmask_b32_e64 v20, 0, -1, s0 -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s0, s2, v5, v[0:1] +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s0, s2, v5, 0 +; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s7, v13 +; GFX10-NEXT: v_cndmask_b32_e64 v14, v17, v14, s0 ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s7, v16 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v19, v18, s0 -; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s7, v15 -; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1 -; GFX10-NEXT: v_cndmask_b32_e64 v17, v20, v17, s0 -; GFX10-NEXT: v_sub_co_u32 v1, s0, v3, s6 -; GFX10-NEXT: v_subrev_co_ci_u32_e64 v9, s0, 0, v9, s0 -; GFX10-NEXT: v_cndmask_b32_e32 v7, v7, v11, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc_lo -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s1, s3, v6, v[0:1] -; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, v17 -; GFX10-NEXT: v_cndmask_b32_e32 v8, v8, v13, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v1, v12, v7, s0 -; GFX10-NEXT: v_cndmask_b32_e32 v7, v16, v9, vcc_lo -; GFX10-NEXT: v_sub_co_u32 v2, vcc_lo, s18, v2 -; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, v8, s0 -; GFX10-NEXT: v_sub_co_ci_u32_e64 v8, s1, s19, v0, vcc_lo -; GFX10-NEXT: v_sub_nc_u32_e32 v0, s19, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v14, v3, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v7, v15, v7, s0 -; GFX10-NEXT: v_cmp_le_u32_e64 s0, s3, v8 -; GFX10-NEXT: v_xor_b32_e32 v1, s16, v1 -; GFX10-NEXT: v_subrev_co_ci_u32_e32 v11, vcc_lo, s3, v0, vcc_lo -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s2, v2 -; GFX10-NEXT: v_xor_b32_e32 v4, s17, v4 -; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, -1, s0 -; GFX10-NEXT: v_xor_b32_e32 v3, s4, v3 -; GFX10-NEXT: v_xor_b32_e32 v7, s4, v7 +; GFX10-NEXT: v_cndmask_b32_e64 v17, v19, v18, s0 +; GFX10-NEXT: v_add_co_u32 v18, s0, v10, 1 +; GFX10-NEXT: v_add_co_ci_u32_e64 v19, s0, 0, v11, s0 +; GFX10-NEXT: v_mad_u64_u32 v[1:2], s0, s2, v7, v[1:2] +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v17 +; GFX10-NEXT: v_sub_co_u32 v2, s0, v15, s6 +; GFX10-NEXT: v_subrev_co_ci_u32_e64 v6, s0, 0, v6, s0 +; GFX10-NEXT: v_cndmask_b32_e32 v8, v10, v18, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v10, v11, v19, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v11, v15, v2, vcc_lo +; GFX10-NEXT: v_mad_u64_u32 v[1:2], s1, s3, v5, v[1:2] +; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, v14 +; GFX10-NEXT: v_cndmask_b32_e64 v2, v3, v8, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v4, v10, s0 +; GFX10-NEXT: v_cndmask_b32_e32 v4, v16, v6, vcc_lo +; GFX10-NEXT: v_sub_co_u32 v8, vcc_lo, s18, v0 +; GFX10-NEXT: v_sub_co_ci_u32_e64 v10, s1, s19, v1, vcc_lo +; GFX10-NEXT: v_sub_nc_u32_e32 v1, s19, v1 +; GFX10-NEXT: v_cndmask_b32_e64 v6, v12, v11, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v4, v13, v4, s0 +; GFX10-NEXT: v_cmp_le_u32_e64 s0, s3, v10 +; GFX10-NEXT: v_xor_b32_e32 v0, s16, v2 +; GFX10-NEXT: v_subrev_co_ci_u32_e32 v11, vcc_lo, s3, v1, vcc_lo +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s2, v8 +; GFX10-NEXT: v_xor_b32_e32 v2, s17, v3 +; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, -1, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v12, 0, -1, vcc_lo -; GFX10-NEXT: v_sub_co_u32 v13, vcc_lo, v2, s2 +; GFX10-NEXT: v_sub_co_u32 v13, vcc_lo, v8, s2 ; GFX10-NEXT: v_subrev_co_ci_u32_e64 v14, s0, 0, v11, vcc_lo -; GFX10-NEXT: v_sub_co_u32 v0, s0, v1, s16 -; GFX10-NEXT: v_subrev_co_ci_u32_e64 v1, s0, s17, v4, s0 -; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s3, v8 +; GFX10-NEXT: v_sub_co_u32 v0, s0, v0, s16 +; GFX10-NEXT: v_subrev_co_ci_u32_e64 v1, s0, s17, v2, s0 +; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s3, v10 +; GFX10-NEXT: v_xor_b32_e32 v2, s4, v6 ; GFX10-NEXT: v_subrev_co_ci_u32_e32 v11, vcc_lo, s3, v11, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v4, v9, v12, s0 +; 
GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v12, s0 ; GFX10-NEXT: v_cmp_le_u32_e64 s0, s3, v14 -; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, -1, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, -1, s0 ; GFX10-NEXT: v_cmp_le_u32_e64 s0, s2, v13 ; GFX10-NEXT: v_cndmask_b32_e64 v12, 0, -1, s0 -; GFX10-NEXT: v_add_co_u32 v15, s0, v6, 1 -; GFX10-NEXT: v_add_co_ci_u32_e64 v16, s0, 0, v5, s0 +; GFX10-NEXT: v_add_co_u32 v15, s0, v5, 1 +; GFX10-NEXT: v_add_co_ci_u32_e64 v16, s0, 0, v7, s0 ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s3, v14 -; GFX10-NEXT: v_cndmask_b32_e64 v9, v9, v12, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, v12, s0 ; GFX10-NEXT: v_add_co_u32 v12, s0, v15, 1 ; GFX10-NEXT: v_add_co_ci_u32_e64 v17, s0, 0, v16, s0 -; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v9 -; GFX10-NEXT: v_sub_co_u32 v9, s0, v13, s2 +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6 +; GFX10-NEXT: v_sub_co_u32 v6, s0, v13, s2 ; GFX10-NEXT: v_subrev_co_ci_u32_e64 v11, s0, 0, v11, s0 ; GFX10-NEXT: v_cndmask_b32_e32 v12, v15, v12, vcc_lo -; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, v4 +; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, v3 ; GFX10-NEXT: v_cndmask_b32_e32 v15, v16, v17, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v4, v13, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v9, v14, v11, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, v12, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v11, v5, v15, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v4, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v8, v8, v9, s0 +; GFX10-NEXT: v_cndmask_b32_e32 v3, v13, v6, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v6, v14, v11, vcc_lo +; GFX10-NEXT: v_xor_b32_e32 v11, s4, v4 +; GFX10-NEXT: v_cndmask_b32_e64 v12, v5, v12, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, v15, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v8, v3, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v6, v10, v6, s0 ; GFX10-NEXT: s_xor_b64 s[0:1], s[8:9], s[10:11] -; GFX10-NEXT: v_sub_co_u32 v4, vcc_lo, v3, s4 -; GFX10-NEXT: v_xor_b32_e32 v3, s0, v6 -; GFX10-NEXT: v_xor_b32_e32 v6, s1, v11 -; GFX10-NEXT: v_subrev_co_ci_u32_e32 v5, vcc_lo, s4, v7, vcc_lo -; GFX10-NEXT: v_xor_b32_e32 v7, s8, v2 -; GFX10-NEXT: v_xor_b32_e32 v8, s8, v8 -; GFX10-NEXT: v_sub_co_u32 v2, vcc_lo, v3, s0 -; GFX10-NEXT: v_subrev_co_ci_u32_e32 v3, vcc_lo, s1, v6, vcc_lo -; GFX10-NEXT: v_sub_co_u32 v6, vcc_lo, v7, s8 -; GFX10-NEXT: v_subrev_co_ci_u32_e32 v7, vcc_lo, s8, v8, vcc_lo -; GFX10-NEXT: global_store_dwordx4 v10, v[0:3], s[12:13] -; GFX10-NEXT: global_store_dwordx4 v10, v[4:7], s[14:15] +; GFX10-NEXT: v_sub_co_u32 v4, vcc_lo, v2, s4 +; GFX10-NEXT: v_xor_b32_e32 v2, s0, v12 +; GFX10-NEXT: v_xor_b32_e32 v7, s1, v7 +; GFX10-NEXT: v_xor_b32_e32 v8, s8, v3 +; GFX10-NEXT: v_subrev_co_ci_u32_e32 v5, vcc_lo, s4, v11, vcc_lo +; GFX10-NEXT: v_xor_b32_e32 v10, s8, v6 +; GFX10-NEXT: v_sub_co_u32 v2, vcc_lo, v2, s0 +; GFX10-NEXT: v_subrev_co_ci_u32_e32 v3, vcc_lo, s1, v7, vcc_lo +; GFX10-NEXT: v_sub_co_u32 v6, vcc_lo, v8, s8 +; GFX10-NEXT: v_subrev_co_ci_u32_e32 v7, vcc_lo, s8, v10, vcc_lo +; GFX10-NEXT: global_store_dwordx4 v9, v[0:3], s[12:13] +; GFX10-NEXT: global_store_dwordx4 v9, v[4:7], s[14:15] ; GFX10-NEXT: s_endpgm %div = sdiv <2 x i64> %x, %y store <2 x i64> %div, ptr addrspace(1) %out0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll index 6f42239cd191d..39cf7b01fd6c0 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll @@ -31,28 +31,28 @@ define i64 @v_srem_i64(i64 %num, i64 %den) { ; CHECK-NEXT: v_xor_b32_e32 v1, v2, v1 ; CHECK-NEXT: v_cvt_f32_u32_e32 v2, 
v0 ; CHECK-NEXT: v_cvt_f32_u32_e32 v3, v1 -; CHECK-NEXT: v_sub_i32_e32 v11, vcc, 0, v0 -; CHECK-NEXT: v_subb_u32_e32 v12, vcc, 0, v1, vcc +; CHECK-NEXT: v_sub_i32_e32 v12, vcc, 0, v0 +; CHECK-NEXT: v_subb_u32_e32 v13, vcc, 0, v1, vcc ; CHECK-NEXT: v_mac_f32_e32 v2, 0x4f800000, v3 ; CHECK-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; CHECK-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 ; CHECK-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2 -; CHECK-NEXT: v_trunc_f32_e32 v6, v3 -; CHECK-NEXT: v_mac_f32_e32 v2, 0xcf800000, v6 -; CHECK-NEXT: v_cvt_u32_f32_e32 v10, v2 -; CHECK-NEXT: v_cvt_u32_f32_e32 v13, v6 -; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v11, v10, 0 -; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v11, v13, v[3:4] -; CHECK-NEXT: v_mul_lo_u32 v3, v13, v2 -; CHECK-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v10, v[6:7] -; CHECK-NEXT: v_mul_hi_u32 v6, v10, v2 -; CHECK-NEXT: v_mul_hi_u32 v2, v13, v2 -; CHECK-NEXT: v_mul_lo_u32 v7, v10, v8 -; CHECK-NEXT: v_mul_lo_u32 v9, v13, v8 +; CHECK-NEXT: v_trunc_f32_e32 v3, v3 +; CHECK-NEXT: v_mac_f32_e32 v2, 0xcf800000, v3 +; CHECK-NEXT: v_cvt_u32_f32_e32 v11, v2 +; CHECK-NEXT: v_cvt_u32_f32_e32 v10, v3 +; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v12, v11, 0 +; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v12, v10, v[3:4] +; CHECK-NEXT: v_mul_lo_u32 v3, v10, v2 +; CHECK-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v13, v11, v[6:7] +; CHECK-NEXT: v_mul_hi_u32 v6, v11, v2 +; CHECK-NEXT: v_mul_hi_u32 v2, v10, v2 +; CHECK-NEXT: v_mul_lo_u32 v7, v11, v8 +; CHECK-NEXT: v_mul_lo_u32 v9, v10, v8 ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v7 ; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v6 -; CHECK-NEXT: v_mul_hi_u32 v6, v10, v8 +; CHECK-NEXT: v_mul_hi_u32 v6, v11, v8 ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v7, v3 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v9, v2 @@ -60,53 +60,53 @@ define i64 @v_srem_i64(i64 %num, i64 %den) { ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v6 ; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; CHECK-NEXT: v_mul_hi_u32 v7, v13, v8 +; CHECK-NEXT: v_mul_hi_u32 v7, v10, v8 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v6, v3 ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v7, v3 -; CHECK-NEXT: v_add_i32_e32 v10, vcc, v10, v2 -; CHECK-NEXT: v_addc_u32_e32 v13, vcc, v13, v3, vcc -; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v11, v10, 0 -; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v11, v13, v[3:4] -; CHECK-NEXT: v_ashrrev_i32_e32 v11, 31, v5 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v11 -; CHECK-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v10, v[6:7] -; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v5, v11, vcc -; CHECK-NEXT: v_xor_b32_e32 v9, v3, v11 -; CHECK-NEXT: v_mul_lo_u32 v3, v13, v2 -; CHECK-NEXT: v_mul_lo_u32 v5, v10, v8 -; CHECK-NEXT: v_xor_b32_e32 v12, v4, v11 -; CHECK-NEXT: v_mul_hi_u32 v4, v10, v2 -; CHECK-NEXT: v_mul_hi_u32 v2, v13, v2 +; CHECK-NEXT: v_add_i32_e32 v11, vcc, v11, v2 +; CHECK-NEXT: v_addc_u32_e32 v10, vcc, v10, v3, vcc +; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v12, v11, 0 +; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v12, v10, v[3:4] +; CHECK-NEXT: v_ashrrev_i32_e32 v12, 31, v5 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v12 +; CHECK-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v13, v11, v[6:7] +; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v5, v12, vcc +; CHECK-NEXT: v_xor_b32_e32 v9, v3, v12 +; CHECK-NEXT: v_mul_lo_u32 v3, v10, v2 +; CHECK-NEXT: v_mul_lo_u32 v5, v11, v8 +; CHECK-NEXT: 
v_xor_b32_e32 v13, v4, v12 +; CHECK-NEXT: v_mul_hi_u32 v4, v11, v2 +; CHECK-NEXT: v_mul_hi_u32 v2, v10, v2 ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v5 ; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v4 ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v4, v13, v8 +; CHECK-NEXT: v_mul_lo_u32 v4, v10, v8 ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v5, v3 -; CHECK-NEXT: v_mul_hi_u32 v5, v10, v8 +; CHECK-NEXT: v_mul_hi_u32 v5, v11, v8 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v4, v2 ; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v5 ; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v5 -; CHECK-NEXT: v_mul_hi_u32 v5, v13, v8 +; CHECK-NEXT: v_mul_hi_u32 v5, v10, v8 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v3 ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v5, v3 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v10, v2 -; CHECK-NEXT: v_addc_u32_e32 v3, vcc, v13, v3, vcc -; CHECK-NEXT: v_mul_lo_u32 v4, v12, v2 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v11, v2 +; CHECK-NEXT: v_addc_u32_e32 v3, vcc, v10, v3, vcc +; CHECK-NEXT: v_mul_lo_u32 v4, v13, v2 ; CHECK-NEXT: v_mul_lo_u32 v5, v9, v3 ; CHECK-NEXT: v_mul_hi_u32 v6, v9, v2 -; CHECK-NEXT: v_mul_hi_u32 v2, v12, v2 +; CHECK-NEXT: v_mul_hi_u32 v2, v13, v2 ; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v5 ; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v6 ; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v6, v12, v3 +; CHECK-NEXT: v_mul_lo_u32 v6, v13, v3 ; CHECK-NEXT: v_add_i32_e32 v4, vcc, v5, v4 ; CHECK-NEXT: v_mul_hi_u32 v5, v9, v3 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v6, v2 @@ -115,16 +115,16 @@ define i64 @v_srem_i64(i64 %num, i64 %den) { ; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5 ; CHECK-NEXT: v_add_i32_e32 v8, vcc, v2, v4 -; CHECK-NEXT: v_mul_hi_u32 v6, v12, v3 +; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v5, v2 +; CHECK-NEXT: v_mul_hi_u32 v5, v13, v3 ; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v0, v8, 0 -; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v4 +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v5, v4 ; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v0, v6, v[3:4] ; CHECK-NEXT: v_sub_i32_e32 v2, vcc, v9, v2 ; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v1, v8, v[4:5] -; CHECK-NEXT: v_subb_u32_e64 v3, s[4:5], v12, v6, vcc -; CHECK-NEXT: v_sub_i32_e64 v4, s[4:5], v12, v6 +; CHECK-NEXT: v_subb_u32_e64 v3, s[4:5], v13, v6, vcc +; CHECK-NEXT: v_sub_i32_e64 v4, s[4:5], v13, v6 ; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v1 ; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[4:5] ; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v0 @@ -149,10 +149,10 @@ define i64 @v_srem_i64(i64 %num, i64 %den) { ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 ; CHECK-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; CHECK-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc -; CHECK-NEXT: v_xor_b32_e32 v0, v0, v11 -; CHECK-NEXT: v_xor_b32_e32 v1, v1, v11 -; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v11 -; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v11, vcc +; CHECK-NEXT: v_xor_b32_e32 v0, v0, v12 +; CHECK-NEXT: v_xor_b32_e32 v1, v1, v12 +; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v12 +; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v12, vcc ; CHECK-NEXT: ; implicit-def: $vgpr2 ; CHECK-NEXT: ; implicit-def: $vgpr4 ; 
CHECK-NEXT: s_andn2_saveexec_b64 s[4:5], s[6:7] @@ -212,10 +212,10 @@ define amdgpu_ps i64 @s_srem_i64(i64 inreg %num, i64 inreg %den) { ; CHECK-NEXT: s_subb_u32 s5, 0, s9 ; CHECK-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; CHECK-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 -; CHECK-NEXT: v_trunc_f32_e32 v2, v1 -; CHECK-NEXT: v_mac_f32_e32 v0, 0xcf800000, v2 +; CHECK-NEXT: v_trunc_f32_e32 v1, v1 +; CHECK-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 ; CHECK-NEXT: v_cvt_u32_f32_e32 v6, v0 -; CHECK-NEXT: v_cvt_u32_f32_e32 v7, v2 +; CHECK-NEXT: v_cvt_u32_f32_e32 v7, v1 ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s3, v6, 0 ; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s3, v7, v[1:2] ; CHECK-NEXT: v_mul_lo_u32 v1, v7, v0 @@ -286,11 +286,11 @@ define amdgpu_ps i64 @s_srem_i64(i64 inreg %num, i64 inreg %den) { ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v3 ; CHECK-NEXT: v_add_i32_e32 v6, vcc, v0, v2 -; CHECK-NEXT: v_mul_hi_u32 v4, s11, v1 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v0 +; CHECK-NEXT: v_mul_hi_u32 v3, s11, v1 ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s8, v6, 0 -; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v2 +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v3, v2 ; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s8, v4, v[1:2] ; CHECK-NEXT: v_mov_b32_e32 v1, s9 ; CHECK-NEXT: v_sub_i32_e32 v0, vcc, s10, v0 @@ -372,212 +372,209 @@ define <2 x i64> @v_srem_v2i64(<2 x i64> %num, <2 x i64> %den) { ; GISEL-NEXT: v_xor_b32_e32 v8, v9, v8 ; GISEL-NEXT: v_cvt_f32_u32_e32 v4, v5 ; GISEL-NEXT: v_cvt_f32_u32_e32 v9, v8 -; GISEL-NEXT: v_sub_i32_e32 v15, vcc, 0, v5 -; GISEL-NEXT: v_subb_u32_e32 v16, vcc, 0, v8, vcc +; GISEL-NEXT: v_sub_i32_e32 v16, vcc, 0, v5 +; GISEL-NEXT: v_subb_u32_e32 v17, vcc, 0, v8, vcc ; GISEL-NEXT: v_mac_f32_e32 v4, 0x4f800000, v9 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v4, v4 ; GISEL-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 ; GISEL-NEXT: v_mul_f32_e32 v9, 0x2f800000, v4 -; GISEL-NEXT: v_trunc_f32_e32 v11, v9 -; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v11 -; GISEL-NEXT: v_cvt_u32_f32_e32 v14, v4 -; GISEL-NEXT: v_cvt_u32_f32_e32 v17, v11 -; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v15, v14, 0 -; GISEL-NEXT: v_mov_b32_e32 v4, v10 -; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v15, v17, v[4:5] -; GISEL-NEXT: v_mul_lo_u32 v4, v17, v9 -; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v16, v14, v[10:11] -; GISEL-NEXT: v_mul_hi_u32 v11, v14, v9 -; GISEL-NEXT: v_mul_hi_u32 v9, v17, v9 -; GISEL-NEXT: v_mul_lo_u32 v10, v14, v12 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v10 +; GISEL-NEXT: v_trunc_f32_e32 v9, v9 +; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v9 +; GISEL-NEXT: v_cvt_u32_f32_e32 v4, v4 +; GISEL-NEXT: v_cvt_u32_f32_e32 v15, v9 +; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v16, v4, 0 +; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v16, v15, v[10:11] +; GISEL-NEXT: v_mul_lo_u32 v10, v15, v9 +; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v17, v4, v[11:12] +; GISEL-NEXT: v_mul_hi_u32 v12, v4, v9 +; GISEL-NEXT: v_mul_hi_u32 v9, v15, v9 +; GISEL-NEXT: v_mul_lo_u32 v11, v4, v13 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v11, v17, v12 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v10, v4 -; 
GISEL-NEXT: v_mul_hi_u32 v10, v14, v12 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v11, v9 +; GISEL-NEXT: v_mul_lo_u32 v12, v15, v13 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; GISEL-NEXT: v_mul_hi_u32 v11, v4, v13 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v12, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11 +; GISEL-NEXT: v_mul_hi_u32 v12, v15, v13 ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; GISEL-NEXT: v_mul_hi_u32 v11, v17, v12 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v9, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v11, v9 -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v4 -; GISEL-NEXT: v_addc_u32_e32 v17, vcc, v17, v9, vcc -; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v15, v14, 0 -; GISEL-NEXT: v_mov_b32_e32 v4, v10 -; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v15, v17, v[4:5] +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v12, v10 +; GISEL-NEXT: v_add_i32_e32 v18, vcc, v4, v9 +; GISEL-NEXT: v_addc_u32_e32 v15, vcc, v15, v10, vcc +; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v16, v18, 0 ; GISEL-NEXT: v_ashrrev_i32_e32 v4, 31, v1 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v4 -; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v16, v14, v[10:11] +; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v16, v15, v[10:11] ; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v4, vcc -; GISEL-NEXT: v_xor_b32_e32 v13, v0, v4 -; GISEL-NEXT: v_mul_lo_u32 v0, v17, v9 -; GISEL-NEXT: v_mul_lo_u32 v10, v14, v12 -; GISEL-NEXT: v_xor_b32_e32 v15, v1, v4 -; GISEL-NEXT: v_mul_hi_u32 v1, v14, v9 -; GISEL-NEXT: v_mul_hi_u32 v9, v17, v9 +; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v17, v18, v[11:12] +; GISEL-NEXT: v_xor_b32_e32 v14, v0, v4 +; GISEL-NEXT: v_mul_lo_u32 v0, v15, v9 +; GISEL-NEXT: v_mul_lo_u32 v10, v18, v13 +; GISEL-NEXT: v_xor_b32_e32 v16, v1, v4 +; GISEL-NEXT: v_mul_hi_u32 v1, v18, v9 +; GISEL-NEXT: v_mul_hi_u32 v9, v15, v9 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v1, v17, v12 +; GISEL-NEXT: v_mul_lo_u32 v1, v15, v13 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v0 -; GISEL-NEXT: v_mul_hi_u32 v10, v14, v12 +; GISEL-NEXT: v_mul_hi_u32 v10, v18, v13 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v10 -; GISEL-NEXT: v_mul_hi_u32 v10, v17, v12 +; GISEL-NEXT: v_mul_hi_u32 v10, v15, v13 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v9, v1 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v14, v0 -; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v17, v1, vcc -; GISEL-NEXT: v_mul_lo_u32 v9, v15, v0 -; GISEL-NEXT: v_mul_lo_u32 v10, v13, v1 -; GISEL-NEXT: v_mul_hi_u32 v11, v13, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v15, v0 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v18, v0 +; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v15, v1, vcc +; GISEL-NEXT: v_mul_lo_u32 v9, v16, v0 +; GISEL-NEXT: v_mul_lo_u32 v10, v14, v1 +; GISEL-NEXT: v_mul_hi_u32 v11, v14, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v16, v0 ; 
GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v11, v15, v1 +; GISEL-NEXT: v_mul_lo_u32 v11, v16, v1 ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9 -; GISEL-NEXT: v_mul_hi_u32 v10, v13, v1 +; GISEL-NEXT: v_mul_hi_u32 v10, v14, v1 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v11, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; GISEL-NEXT: v_mul_hi_u32 v1, v15, v1 -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v0, v9 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v0, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v0 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v1, v0 -; GISEL-NEXT: v_ashrrev_i32_e32 v9, 31, v7 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v9 -; GISEL-NEXT: v_addc_u32_e32 v7, vcc, v7, v9, vcc -; GISEL-NEXT: v_xor_b32_e32 v6, v6, v9 -; GISEL-NEXT: v_xor_b32_e32 v7, v7, v9 -; GISEL-NEXT: v_cvt_f32_u32_e32 v12, v6 -; GISEL-NEXT: v_cvt_f32_u32_e32 v16, v7 -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v14, 0 -; GISEL-NEXT: v_mac_f32_e32 v12, 0x4f800000, v16 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v0 +; GISEL-NEXT: v_mul_hi_u32 v10, v16, v1 +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v13, 0 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v10, v9 ; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v5, v11, v[1:2] -; GISEL-NEXT: v_rcp_iflag_f32_e32 v1, v12 -; GISEL-NEXT: v_sub_i32_e32 v16, vcc, 0, v6 -; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v8, v14, v[9:10] -; GISEL-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v1 -; GISEL-NEXT: v_mul_f32_e32 v9, 0x2f800000, v1 -; GISEL-NEXT: v_trunc_f32_e32 v12, v9 -; GISEL-NEXT: v_mac_f32_e32 v1, 0xcf800000, v12 -; GISEL-NEXT: v_cvt_u32_f32_e32 v14, v1 -; GISEL-NEXT: v_cvt_u32_f32_e32 v18, v12 -; GISEL-NEXT: v_subb_u32_e32 v17, vcc, 0, v7, vcc -; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v16, v14, 0 -; GISEL-NEXT: v_sub_i32_e32 v19, vcc, v13, v0 -; GISEL-NEXT: v_mov_b32_e32 v0, v10 -; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v16, v18, v[0:1] -; GISEL-NEXT: v_subb_u32_e64 v10, s[4:5], v15, v11, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v17, v14, v[12:13] -; GISEL-NEXT: v_sub_i32_e64 v1, s[4:5], v15, v11 -; GISEL-NEXT: v_mul_lo_u32 v11, v18, v9 -; GISEL-NEXT: v_mul_lo_u32 v12, v14, v0 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v10, v8 -; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v8, vcc -; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v11, v12 -; GISEL-NEXT: v_mul_hi_u32 v12, v14, v9 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] -; GISEL-NEXT: v_mul_hi_u32 v9, v18, v9 -; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v11, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[6:7] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v19, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[6:7] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], v10, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v11, v11, v12, s[6:7] -; GISEL-NEXT: v_sub_i32_e32 v12, vcc, v19, v5 -; GISEL-NEXT: v_subbrev_u32_e64 v15, s[6:7], 0, v1, vcc -; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v12, v5 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v15, v8 -; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v8, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, -1, s[8:9] -; GISEL-NEXT: v_cndmask_b32_e64 v21, 0, -1, s[6:7] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], v15, v8 -; GISEL-NEXT: v_sub_i32_e32 v5, vcc, v12, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v20, 
v20, v21, s[6:7] -; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 -; GISEL-NEXT: v_cndmask_b32_e32 v5, v12, v5, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v8, v15, v1, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5] -; GISEL-NEXT: v_mul_lo_u32 v12, v18, v0 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v13, v1 -; GISEL-NEXT: v_mul_hi_u32 v13, v14, v0 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v12, v9 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v13 -; GISEL-NEXT: v_mul_hi_u32 v0, v18, v0 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v9, v1 +; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v8, v13, v[9:10] +; GISEL-NEXT: v_sub_i32_e32 v10, vcc, v14, v0 +; GISEL-NEXT: v_subb_u32_e64 v12, s[4:5], v16, v11, vcc +; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v16, v11 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v12, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v10, v5 +; GISEL-NEXT: v_subb_u32_e32 v0, vcc, v0, v8, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v12, v8 +; GISEL-NEXT: v_sub_i32_e32 v13, vcc, v10, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v11, v1, v9, s[4:5] +; GISEL-NEXT: v_subbrev_u32_e64 v14, s[4:5], 0, v0, vcc +; GISEL-NEXT: v_ashrrev_i32_e32 v1, 31, v7 +; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v6, v1 +; GISEL-NEXT: v_addc_u32_e64 v7, s[4:5], v7, v1, s[4:5] +; GISEL-NEXT: v_xor_b32_e32 v6, v6, v1 +; GISEL-NEXT: v_xor_b32_e32 v7, v7, v1 +; GISEL-NEXT: v_cvt_f32_u32_e32 v1, v6 +; GISEL-NEXT: v_cvt_f32_u32_e32 v9, v7 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v14, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v13, v5 +; GISEL-NEXT: v_mac_f32_e32 v1, 0x4f800000, v9 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v1, v1 +; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v14, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v15, v15, v16, s[4:5] +; GISEL-NEXT: v_subb_u32_e32 v16, vcc, v0, v8, vcc +; GISEL-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v1 +; GISEL-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 +; GISEL-NEXT: v_trunc_f32_e32 v1, v1 +; GISEL-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 +; GISEL-NEXT: v_cvt_u32_f32_e32 v17, v0 +; GISEL-NEXT: v_sub_i32_e32 v19, vcc, 0, v6 +; GISEL-NEXT: v_cvt_u32_f32_e32 v18, v1 +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v19, v17, 0 +; GISEL-NEXT: v_subb_u32_e32 v20, vcc, 0, v7, vcc +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v19, v18, v[1:2] +; GISEL-NEXT: v_mul_lo_u32 v21, v18, v0 +; GISEL-NEXT: v_mul_hi_u32 v22, v17, v0 +; GISEL-NEXT: v_mul_hi_u32 v23, v18, v0 +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v13, v5 +; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v16, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v15 +; GISEL-NEXT: v_cndmask_b32_e32 v13, v13, v0, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v20, v17, v[8:9] +; GISEL-NEXT: v_cndmask_b32_e32 v5, v14, v5, vcc +; GISEL-NEXT: v_mul_lo_u32 v1, v17, v0 +; GISEL-NEXT: v_mul_lo_u32 v9, v18, v0 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v21, v1 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v22 +; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v8, v1 +; GISEL-NEXT: v_mul_hi_u32 v8, v17, v0 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v23 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8 ; GISEL-NEXT: 
v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v12, v9 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v9 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v14, v1 -; GISEL-NEXT: v_addc_u32_e32 v13, vcc, v18, v0, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v16, v12, 0 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v14, v9 +; GISEL-NEXT: v_mul_hi_u32 v0, v18, v0 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v8, v1 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v8 +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v17, v1 +; GISEL-NEXT: v_addc_u32_e32 v15, vcc, v18, v0, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v19, v14, 0 ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 -; GISEL-NEXT: v_cndmask_b32_e32 v14, v10, v8, vcc -; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v16, v13, v[1:2] -; GISEL-NEXT: v_cndmask_b32_e32 v5, v19, v5, vcc -; GISEL-NEXT: v_xor_b32_e32 v1, v5, v4 -; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v17, v12, v[8:9] -; GISEL-NEXT: v_ashrrev_i32_e32 v5, 31, v3 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v5 -; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v5, vcc -; GISEL-NEXT: v_xor_b32_e32 v11, v2, v5 -; GISEL-NEXT: v_mul_lo_u32 v2, v13, v0 -; GISEL-NEXT: v_mul_lo_u32 v8, v12, v10 -; GISEL-NEXT: v_xor_b32_e32 v15, v3, v5 -; GISEL-NEXT: v_mul_hi_u32 v3, v12, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v13, v0 +; GISEL-NEXT: v_cndmask_b32_e32 v10, v10, v13, vcc +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v19, v15, v[1:2] +; GISEL-NEXT: v_xor_b32_e32 v1, v10, v4 +; GISEL-NEXT: v_cndmask_b32_e32 v5, v12, v5, vcc +; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v20, v14, v[8:9] +; GISEL-NEXT: v_ashrrev_i32_e32 v12, 31, v3 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v12 +; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v12, vcc +; GISEL-NEXT: v_xor_b32_e32 v11, v2, v12 +; GISEL-NEXT: v_mul_lo_u32 v2, v15, v0 +; GISEL-NEXT: v_mul_lo_u32 v8, v14, v10 +; GISEL-NEXT: v_xor_b32_e32 v13, v3, v12 +; GISEL-NEXT: v_mul_hi_u32 v3, v14, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v15, v0 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v3, v13, v10 +; GISEL-NEXT: v_mul_lo_u32 v3, v15, v10 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v8, v2 -; GISEL-NEXT: v_mul_hi_u32 v8, v12, v10 +; GISEL-NEXT: v_mul_hi_u32 v8, v14, v10 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v8 -; GISEL-NEXT: v_mul_hi_u32 v8, v13, v10 +; GISEL-NEXT: v_mul_hi_u32 v8, v15, v10 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v8, v2 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v12, v0 -; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v13, v2, vcc -; GISEL-NEXT: v_mul_lo_u32 v3, v15, v0 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v14, v0 +; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v15, v2, vcc +; GISEL-NEXT: v_mul_lo_u32 v3, v13, v0 ; GISEL-NEXT: v_mul_lo_u32 v8, v11, v2 ; GISEL-NEXT: v_mul_hi_u32 v9, v11, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v15, v0 -; GISEL-NEXT: v_xor_b32_e32 v10, v14, v4 +; GISEL-NEXT: v_mul_hi_u32 v0, v13, v0 +; GISEL-NEXT: v_xor_b32_e32 v5, v5, v4 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GISEL-NEXT: 
v_add_i32_e32 v3, vcc, v3, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v9, v15, v2 +; GISEL-NEXT: v_mul_lo_u32 v9, v13, v2 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v8, v3 ; GISEL-NEXT: v_mul_hi_u32 v8, v11, v2 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v9, v0 @@ -585,48 +582,47 @@ define <2 x i64> @v_srem_v2i64(<2 x i64> %num, <2 x i64> %den) { ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v0, v3 -; GISEL-NEXT: v_mul_hi_u32 v9, v15, v2 -; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v12, 0 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v0, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v8, v0 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v9, v0 -; GISEL-NEXT: v_mov_b32_e32 v0, v3 -; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v6, v13, v[0:1] +; GISEL-NEXT: v_mul_hi_u32 v8, v13, v2 +; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v10, 0 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v8, v0 +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v6, v0, v[3:4] ; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v1, v4 -; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v10, v4, vcc -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v7, v12, v[8:9] +; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v5, v4, vcc +; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v7, v10, v[8:9] ; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v11, v2 -; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], v15, v3, vcc -; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v15, v3 +; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], v13, v3, vcc +; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v13, v3 ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[4:5] ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v6 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] ; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, v7 ; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v7, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v8, v8, v9, s[4:5] -; GISEL-NEXT: v_sub_i32_e32 v9, vcc, v2, v6 -; GISEL-NEXT: v_subbrev_u32_e64 v10, s[4:5], 0, v3, vcc -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v10, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v6 +; GISEL-NEXT: v_cndmask_b32_e64 v5, v5, v8, s[4:5] +; GISEL-NEXT: v_sub_i32_e32 v8, vcc, v2, v6 +; GISEL-NEXT: v_subbrev_u32_e64 v9, s[4:5], 0, v3, vcc +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v6 ; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v7, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v10, v7 -; GISEL-NEXT: v_sub_i32_e32 v6, vcc, v9, v6 -; GISEL-NEXT: v_cndmask_b32_e64 v11, v11, v12, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v9, v7 +; GISEL-NEXT: v_sub_i32_e32 v6, vcc, v8, v6 +; GISEL-NEXT: v_cndmask_b32_e64 v10, v10, v11, s[4:5] ; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 -; GISEL-NEXT: v_cndmask_b32_e32 v6, v9, v6, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v3, v10, v3, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; GISEL-NEXT: v_cndmask_b32_e32 v6, v8, v6, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 ; GISEL-NEXT: v_cndmask_b32_e32 v2, v2, 
v6, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc -; GISEL-NEXT: v_xor_b32_e32 v2, v2, v5 -; GISEL-NEXT: v_xor_b32_e32 v3, v3, v5 -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v5 -; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v5, vcc +; GISEL-NEXT: v_xor_b32_e32 v2, v2, v12 +; GISEL-NEXT: v_xor_b32_e32 v3, v3, v12 +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v12 +; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v12, vcc ; GISEL-NEXT: s_setpc_b64 s[30:31] ; ; CGP-LABEL: v_srem_v2i64: @@ -651,28 +647,28 @@ define <2 x i64> @v_srem_v2i64(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: v_xor_b32_e32 v1, v2, v1 ; CGP-NEXT: v_cvt_f32_u32_e32 v2, v0 ; CGP-NEXT: v_cvt_f32_u32_e32 v3, v1 -; CGP-NEXT: v_sub_i32_e32 v15, vcc, 0, v0 -; CGP-NEXT: v_subb_u32_e32 v16, vcc, 0, v1, vcc +; CGP-NEXT: v_sub_i32_e32 v16, vcc, 0, v0 +; CGP-NEXT: v_subb_u32_e32 v17, vcc, 0, v1, vcc ; CGP-NEXT: v_mac_f32_e32 v2, 0x4f800000, v3 ; CGP-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; CGP-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 ; CGP-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2 -; CGP-NEXT: v_trunc_f32_e32 v4, v3 -; CGP-NEXT: v_mac_f32_e32 v2, 0xcf800000, v4 -; CGP-NEXT: v_cvt_u32_f32_e32 v14, v2 -; CGP-NEXT: v_cvt_u32_f32_e32 v17, v4 -; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v15, v14, 0 -; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v15, v17, v[3:4] -; CGP-NEXT: v_mul_lo_u32 v3, v17, v2 -; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v16, v14, v[4:5] -; CGP-NEXT: v_mul_hi_u32 v4, v14, v2 -; CGP-NEXT: v_mul_hi_u32 v2, v17, v2 -; CGP-NEXT: v_mul_lo_u32 v5, v14, v12 -; CGP-NEXT: v_mul_lo_u32 v13, v17, v12 +; CGP-NEXT: v_trunc_f32_e32 v3, v3 +; CGP-NEXT: v_mac_f32_e32 v2, 0xcf800000, v3 +; CGP-NEXT: v_cvt_u32_f32_e32 v15, v2 +; CGP-NEXT: v_cvt_u32_f32_e32 v14, v3 +; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v16, v15, 0 +; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v16, v14, v[3:4] +; CGP-NEXT: v_mul_lo_u32 v3, v14, v2 +; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v17, v15, v[4:5] +; CGP-NEXT: v_mul_hi_u32 v4, v15, v2 +; CGP-NEXT: v_mul_hi_u32 v2, v14, v2 +; CGP-NEXT: v_mul_lo_u32 v5, v15, v12 +; CGP-NEXT: v_mul_lo_u32 v13, v14, v12 ; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v5 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v4 -; CGP-NEXT: v_mul_hi_u32 v4, v14, v12 +; CGP-NEXT: v_mul_hi_u32 v4, v15, v12 ; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v3, vcc, v5, v3 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v13, v2 @@ -680,53 +676,53 @@ define <2 x i64> @v_srem_v2i64(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; CGP-NEXT: v_mul_hi_u32 v5, v17, v12 +; CGP-NEXT: v_mul_hi_u32 v5, v14, v12 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v3, vcc, v4, v3 ; CGP-NEXT: v_add_i32_e32 v3, vcc, v5, v3 -; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v2 -; CGP-NEXT: v_addc_u32_e32 v17, vcc, v17, v3, vcc -; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v15, v14, 0 -; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v15, v17, v[3:4] -; CGP-NEXT: v_ashrrev_i32_e32 v15, 31, v11 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v10, v15 -; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v16, v14, v[4:5] -; CGP-NEXT: v_addc_u32_e32 v4, vcc, v11, v15, vcc -; CGP-NEXT: v_xor_b32_e32 v13, v3, v15 -; CGP-NEXT: v_mul_lo_u32 v3, v17, v2 -; CGP-NEXT: v_mul_lo_u32 v5, v14, v12 -; CGP-NEXT: v_xor_b32_e32 v16, v4, v15 -; CGP-NEXT: v_mul_hi_u32 v4, v14, v2 -; CGP-NEXT: v_mul_hi_u32 v2, v17, v2 +; CGP-NEXT: 
v_add_i32_e32 v15, vcc, v15, v2 +; CGP-NEXT: v_addc_u32_e32 v14, vcc, v14, v3, vcc +; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v16, v15, 0 +; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v16, v14, v[3:4] +; CGP-NEXT: v_ashrrev_i32_e32 v16, 31, v11 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v10, v16 +; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v17, v15, v[4:5] +; CGP-NEXT: v_addc_u32_e32 v4, vcc, v11, v16, vcc +; CGP-NEXT: v_xor_b32_e32 v13, v3, v16 +; CGP-NEXT: v_mul_lo_u32 v3, v14, v2 +; CGP-NEXT: v_mul_lo_u32 v5, v15, v12 +; CGP-NEXT: v_xor_b32_e32 v17, v4, v16 +; CGP-NEXT: v_mul_hi_u32 v4, v15, v2 +; CGP-NEXT: v_mul_hi_u32 v2, v14, v2 ; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v5 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v4 ; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v4, v17, v12 +; CGP-NEXT: v_mul_lo_u32 v4, v14, v12 ; CGP-NEXT: v_add_i32_e32 v3, vcc, v5, v3 -; CGP-NEXT: v_mul_hi_u32 v5, v14, v12 +; CGP-NEXT: v_mul_hi_u32 v5, v15, v12 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v4, v2 ; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v5 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5 -; CGP-NEXT: v_mul_hi_u32 v5, v17, v12 +; CGP-NEXT: v_mul_hi_u32 v5, v14, v12 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v3, vcc, v4, v3 ; CGP-NEXT: v_add_i32_e32 v3, vcc, v5, v3 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v14, v2 -; CGP-NEXT: v_addc_u32_e32 v3, vcc, v17, v3, vcc -; CGP-NEXT: v_mul_lo_u32 v4, v16, v2 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v15, v2 +; CGP-NEXT: v_addc_u32_e32 v3, vcc, v14, v3, vcc +; CGP-NEXT: v_mul_lo_u32 v4, v17, v2 ; CGP-NEXT: v_mul_lo_u32 v5, v13, v3 ; CGP-NEXT: v_mul_hi_u32 v10, v13, v2 -; CGP-NEXT: v_mul_hi_u32 v2, v16, v2 +; CGP-NEXT: v_mul_hi_u32 v2, v17, v2 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v10 ; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v10, v16, v3 +; CGP-NEXT: v_mul_lo_u32 v10, v17, v3 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v5, v4 ; CGP-NEXT: v_mul_hi_u32 v5, v13, v3 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v10, v2 @@ -735,16 +731,16 @@ define <2 x i64> @v_srem_v2i64(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v5 ; CGP-NEXT: v_add_i32_e32 v12, vcc, v2, v4 -; CGP-NEXT: v_mul_hi_u32 v10, v16, v3 +; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v4, vcc, v5, v2 +; CGP-NEXT: v_mul_hi_u32 v5, v17, v3 ; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v0, v12, 0 -; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v4 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v5, v4 ; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v0, v10, v[3:4] ; CGP-NEXT: v_sub_i32_e32 v2, vcc, v13, v2 ; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v1, v12, v[4:5] -; CGP-NEXT: v_subb_u32_e64 v3, s[4:5], v16, v10, vcc -; CGP-NEXT: v_sub_i32_e64 v4, s[4:5], v16, v10 +; CGP-NEXT: v_subb_u32_e64 v3, s[4:5], v17, v10, vcc +; CGP-NEXT: v_sub_i32_e64 v4, s[4:5], v17, v10 ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v1 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v0 @@ -769,10 +765,10 @@ define <2 x i64> @v_srem_v2i64(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 ; CGP-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; CGP-NEXT: 
v_cndmask_b32_e32 v1, v3, v1, vcc -; CGP-NEXT: v_xor_b32_e32 v0, v0, v15 -; CGP-NEXT: v_xor_b32_e32 v1, v1, v15 -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v15 -; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v15, vcc +; CGP-NEXT: v_xor_b32_e32 v0, v0, v16 +; CGP-NEXT: v_xor_b32_e32 v1, v1, v16 +; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v16 +; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v16, vcc ; CGP-NEXT: ; implicit-def: $vgpr4 ; CGP-NEXT: ; implicit-def: $vgpr10 ; CGP-NEXT: .LBB2_2: ; %Flow1 @@ -820,28 +816,28 @@ define <2 x i64> @v_srem_v2i64(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: v_xor_b32_e32 v3, v4, v3 ; CGP-NEXT: v_cvt_f32_u32_e32 v4, v2 ; CGP-NEXT: v_cvt_f32_u32_e32 v5, v3 -; CGP-NEXT: v_sub_i32_e32 v13, vcc, 0, v2 -; CGP-NEXT: v_subb_u32_e32 v14, vcc, 0, v3, vcc +; CGP-NEXT: v_sub_i32_e32 v14, vcc, 0, v2 +; CGP-NEXT: v_subb_u32_e32 v15, vcc, 0, v3, vcc ; CGP-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5 ; CGP-NEXT: v_rcp_iflag_f32_e32 v4, v4 ; CGP-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 ; CGP-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4 -; CGP-NEXT: v_trunc_f32_e32 v6, v5 -; CGP-NEXT: v_mac_f32_e32 v4, 0xcf800000, v6 -; CGP-NEXT: v_cvt_u32_f32_e32 v12, v4 -; CGP-NEXT: v_cvt_u32_f32_e32 v15, v6 -; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v13, v12, 0 -; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v13, v15, v[5:6] -; CGP-NEXT: v_mul_lo_u32 v5, v15, v4 -; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v12, v[6:7] -; CGP-NEXT: v_mul_hi_u32 v6, v12, v4 -; CGP-NEXT: v_mul_hi_u32 v4, v15, v4 -; CGP-NEXT: v_mul_lo_u32 v7, v12, v10 -; CGP-NEXT: v_mul_lo_u32 v11, v15, v10 +; CGP-NEXT: v_trunc_f32_e32 v5, v5 +; CGP-NEXT: v_mac_f32_e32 v4, 0xcf800000, v5 +; CGP-NEXT: v_cvt_u32_f32_e32 v13, v4 +; CGP-NEXT: v_cvt_u32_f32_e32 v12, v5 +; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v14, v13, 0 +; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v14, v12, v[5:6] +; CGP-NEXT: v_mul_lo_u32 v5, v12, v4 +; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v15, v13, v[6:7] +; CGP-NEXT: v_mul_hi_u32 v6, v13, v4 +; CGP-NEXT: v_mul_hi_u32 v4, v12, v4 +; CGP-NEXT: v_mul_lo_u32 v7, v13, v10 +; CGP-NEXT: v_mul_lo_u32 v11, v12, v10 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v7 ; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v6 -; CGP-NEXT: v_mul_hi_u32 v6, v12, v10 +; CGP-NEXT: v_mul_hi_u32 v6, v13, v10 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v11, v4 @@ -849,53 +845,53 @@ define <2 x i64> @v_srem_v2i64(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v6 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; CGP-NEXT: v_mul_hi_u32 v7, v15, v10 +; CGP-NEXT: v_mul_hi_u32 v7, v12, v10 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v5 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5 -; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v4 -; CGP-NEXT: v_addc_u32_e32 v15, vcc, v15, v5, vcc -; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v13, v12, 0 -; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v13, v15, v[5:6] -; CGP-NEXT: v_ashrrev_i32_e32 v13, 31, v9 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v8, v13 -; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v12, v[6:7] -; CGP-NEXT: v_addc_u32_e32 v6, vcc, v9, v13, vcc -; CGP-NEXT: v_xor_b32_e32 v11, v5, v13 -; CGP-NEXT: v_mul_lo_u32 v5, v15, v4 -; CGP-NEXT: v_mul_lo_u32 v7, v12, v10 -; CGP-NEXT: v_xor_b32_e32 v14, v6, v13 -; CGP-NEXT: v_mul_hi_u32 v6, v12, v4 -; CGP-NEXT: v_mul_hi_u32 v4, v15, v4 +; CGP-NEXT: 
v_add_i32_e32 v13, vcc, v13, v4 +; CGP-NEXT: v_addc_u32_e32 v12, vcc, v12, v5, vcc +; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v14, v13, 0 +; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v14, v12, v[5:6] +; CGP-NEXT: v_ashrrev_i32_e32 v14, 31, v9 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v8, v14 +; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v15, v13, v[6:7] +; CGP-NEXT: v_addc_u32_e32 v6, vcc, v9, v14, vcc +; CGP-NEXT: v_xor_b32_e32 v11, v5, v14 +; CGP-NEXT: v_mul_lo_u32 v5, v12, v4 +; CGP-NEXT: v_mul_lo_u32 v7, v13, v10 +; CGP-NEXT: v_xor_b32_e32 v15, v6, v14 +; CGP-NEXT: v_mul_hi_u32 v6, v13, v4 +; CGP-NEXT: v_mul_hi_u32 v4, v12, v4 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v7 ; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v6 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v6, v15, v10 +; CGP-NEXT: v_mul_lo_u32 v6, v12, v10 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5 -; CGP-NEXT: v_mul_hi_u32 v7, v12, v10 +; CGP-NEXT: v_mul_hi_u32 v7, v13, v10 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v6, v4 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v7 ; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v7 -; CGP-NEXT: v_mul_hi_u32 v7, v15, v10 +; CGP-NEXT: v_mul_hi_u32 v7, v12, v10 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v5 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v12, v4 -; CGP-NEXT: v_addc_u32_e32 v5, vcc, v15, v5, vcc -; CGP-NEXT: v_mul_lo_u32 v6, v14, v4 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v13, v4 +; CGP-NEXT: v_addc_u32_e32 v5, vcc, v12, v5, vcc +; CGP-NEXT: v_mul_lo_u32 v6, v15, v4 ; CGP-NEXT: v_mul_lo_u32 v7, v11, v5 ; CGP-NEXT: v_mul_hi_u32 v8, v11, v4 -; CGP-NEXT: v_mul_hi_u32 v4, v14, v4 +; CGP-NEXT: v_mul_hi_u32 v4, v15, v4 ; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v7 ; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v8 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v8, v14, v5 +; CGP-NEXT: v_mul_lo_u32 v8, v15, v5 ; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6 ; CGP-NEXT: v_mul_hi_u32 v7, v11, v5 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v8, v4 @@ -904,16 +900,16 @@ define <2 x i64> @v_srem_v2i64(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v7 ; CGP-NEXT: v_add_i32_e32 v10, vcc, v4, v6 -; CGP-NEXT: v_mul_hi_u32 v8, v14, v5 +; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v4 +; CGP-NEXT: v_mul_hi_u32 v7, v15, v5 ; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v10, 0 -; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v6 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v7, v6 ; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v2, v8, v[5:6] ; CGP-NEXT: v_sub_i32_e32 v4, vcc, v11, v4 ; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v3, v10, v[6:7] -; CGP-NEXT: v_subb_u32_e64 v5, s[4:5], v14, v8, vcc -; CGP-NEXT: v_sub_i32_e64 v6, s[4:5], v14, v8 +; CGP-NEXT: v_subb_u32_e64 v5, s[4:5], v15, v8, vcc +; CGP-NEXT: v_sub_i32_e64 v6, s[4:5], v15, v8 ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v3 ; CGP-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v2 @@ -938,10 +934,10 @@ define <2 x i64> @v_srem_v2i64(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 ; CGP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; CGP-NEXT: v_cndmask_b32_e32 v3, 
v5, v3, vcc -; CGP-NEXT: v_xor_b32_e32 v2, v2, v13 -; CGP-NEXT: v_xor_b32_e32 v3, v3, v13 -; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v13 -; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v13, vcc +; CGP-NEXT: v_xor_b32_e32 v2, v2, v14 +; CGP-NEXT: v_xor_b32_e32 v3, v3, v14 +; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v14 +; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v14, vcc ; CGP-NEXT: ; implicit-def: $vgpr6 ; CGP-NEXT: ; implicit-def: $vgpr8 ; CGP-NEXT: s_andn2_saveexec_b64 s[4:5], s[6:7] @@ -982,10 +978,10 @@ define i64 @v_srem_i64_pow2k_denom(i64 %num) { ; CHECK-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; CHECK-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 ; CHECK-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2 -; CHECK-NEXT: v_trunc_f32_e32 v4, v3 -; CHECK-NEXT: v_mac_f32_e32 v2, 0xcf800000, v4 +; CHECK-NEXT: v_trunc_f32_e32 v3, v3 +; CHECK-NEXT: v_mac_f32_e32 v2, 0xcf800000, v3 ; CHECK-NEXT: v_cvt_u32_f32_e32 v8, v2 -; CHECK-NEXT: v_cvt_u32_f32_e32 v10, v4 +; CHECK-NEXT: v_cvt_u32_f32_e32 v10, v3 ; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v8, 0 ; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v9, v10, v[3:4] ; CHECK-NEXT: v_mul_lo_u32 v3, v10, v2 @@ -1061,11 +1057,11 @@ define i64 @v_srem_i64_pow2k_denom(i64 %num) { ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v7, v3 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v0, v2 -; CHECK-NEXT: v_mul_hi_u32 v8, v5, v1 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; CHECK-NEXT: v_mul_hi_u32 v7, v5, v1 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v0 ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v2, 0 -; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v7 -; CHECK-NEXT: v_add_i32_e32 v7, vcc, v8, v2 +; CHECK-NEXT: v_add_i32_e32 v7, vcc, v7, v3 ; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v7, v[1:2] ; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v4, v0 ; CHECK-NEXT: v_subb_u32_e64 v1, s[4:5], v5, v2, vcc @@ -1112,153 +1108,150 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) { ; GISEL-NEXT: s_subb_u32 s6, 0, 0 ; GISEL-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 ; GISEL-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4 -; GISEL-NEXT: v_trunc_f32_e32 v7, v5 -; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v7 -; GISEL-NEXT: v_cvt_u32_f32_e32 v8, v4 -; GISEL-NEXT: v_cvt_u32_f32_e32 v9, v7 -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v8, 0 -; GISEL-NEXT: v_mov_b32_e32 v7, v5 -; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v6, v9, v[7:8] -; GISEL-NEXT: v_mul_hi_u32 v12, v9, v4 -; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], s6, v8, v[10:11] -; GISEL-NEXT: v_mul_lo_u32 v10, v9, v4 -; GISEL-NEXT: v_mul_hi_u32 v11, v8, v4 -; GISEL-NEXT: v_mul_lo_u32 v7, v8, v13 -; GISEL-NEXT: v_mul_lo_u32 v4, v9, v13 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v10, v7 +; GISEL-NEXT: v_trunc_f32_e32 v5, v5 +; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v5 +; GISEL-NEXT: v_cvt_u32_f32_e32 v7, v4 +; GISEL-NEXT: v_cvt_u32_f32_e32 v8, v5 +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v7, 0 +; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v6, v8, v[5:6] +; GISEL-NEXT: v_mul_hi_u32 v11, v7, v4 +; GISEL-NEXT: v_mul_hi_u32 v12, v8, v4 +; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], s6, v7, v[9:10] +; GISEL-NEXT: v_mul_lo_u32 v10, v8, v4 +; GISEL-NEXT: v_mul_lo_u32 v9, v7, v13 +; GISEL-NEXT: v_mul_lo_u32 v4, v8, v13 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v14, v7 -; GISEL-NEXT: v_mul_hi_u32 v14, v8, v13 
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v14, v9 +; GISEL-NEXT: v_mul_hi_u32 v14, v7, v13 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v14 -; GISEL-NEXT: v_mul_hi_u32 v13, v9, v13 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v14, v7 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v13, v7 -; GISEL-NEXT: v_add_i32_e32 v18, vcc, v8, v4 -; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v18, 0 -; GISEL-NEXT: v_addc_u32_e32 v19, vcc, v9, v7, vcc -; GISEL-NEXT: v_mov_b32_e32 v4, v14 -; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v19, v[4:5] -; GISEL-NEXT: v_mul_lo_u32 v4, v19, v13 -; GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], s6, v18, v[14:15] +; GISEL-NEXT: v_mul_hi_u32 v13, v8, v13 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v14, v9 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v13, v9 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v7, v4 +; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v4, 0 +; GISEL-NEXT: v_addc_u32_e32 v17, vcc, v8, v9, vcc +; GISEL-NEXT: v_mad_u64_u32 v[15:16], s[4:5], v6, v17, v[14:15] +; GISEL-NEXT: v_mul_lo_u32 v9, v17, v13 +; GISEL-NEXT: v_mul_hi_u32 v18, v4, v13 +; GISEL-NEXT: v_mul_hi_u32 v19, v17, v13 +; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], s6, v4, v[15:16] ; GISEL-NEXT: s_mov_b32 s6, 1 ; GISEL-NEXT: s_cmp_lg_u32 s6, 0 -; GISEL-NEXT: v_mul_lo_u32 v7, v18, v16 +; GISEL-NEXT: v_mul_lo_u32 v14, v4, v13 +; GISEL-NEXT: v_mul_hi_u32 v15, v4, v13 ; GISEL-NEXT: s_subb_u32 s6, 0, 0 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v7 -; GISEL-NEXT: v_mul_hi_u32 v7, v18, v13 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GISEL-NEXT: v_mul_hi_u32 v7, v19, v13 -; GISEL-NEXT: v_mul_lo_u32 v13, v19, v16 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v14, v4 -; GISEL-NEXT: v_mul_hi_u32 v14, v18, v16 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v13, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v7, v14 -; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v7 -; GISEL-NEXT: v_ashrrev_i32_e32 v7, 31, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v7 -; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v7, vcc -; GISEL-NEXT: v_xor_b32_e32 v17, v0, v7 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v14, v4 -; GISEL-NEXT: v_mul_hi_u32 v4, v19, v16 -; GISEL-NEXT: v_xor_b32_e32 v20, v1, v7 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v18 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v9 +; GISEL-NEXT: v_mul_lo_u32 v9, v17, v13 +; GISEL-NEXT: v_mul_hi_u32 v13, v17, v13 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v19 +; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v9, v15 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v16, vcc, v16, v9 +; GISEL-NEXT: v_ashrrev_i32_e32 v9, 31, v1 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v9 +; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v9, vcc +; GISEL-NEXT: v_xor_b32_e32 v18, v0, v9 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v15, v14 +; GISEL-NEXT: v_xor_b32_e32 
v19, v1, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v16, v1 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v13, v1 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v4, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v18, v0 -; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v19, v1, vcc -; GISEL-NEXT: v_mul_lo_u32 v13, v20, v0 -; GISEL-NEXT: v_mul_lo_u32 v14, v17, v1 -; GISEL-NEXT: v_mul_hi_u32 v15, v17, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v20, v0 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v4, v0 +; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v17, v1, vcc +; GISEL-NEXT: v_mul_lo_u32 v13, v19, v0 +; GISEL-NEXT: v_mul_lo_u32 v14, v18, v1 +; GISEL-NEXT: v_mul_hi_u32 v15, v18, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v19, v0 ; GISEL-NEXT: v_mov_b32_e32 v4, 0x1000 ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v15 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v15, v20, v1 +; GISEL-NEXT: v_mul_lo_u32 v15, v19, v1 ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; GISEL-NEXT: v_mul_hi_u32 v14, v17, v1 +; GISEL-NEXT: v_mul_hi_u32 v14, v18, v1 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v15, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v14 -; GISEL-NEXT: v_add_i32_e32 v18, vcc, v0, v13 -; GISEL-NEXT: v_mul_hi_u32 v15, v20, v1 -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v18, 0 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v13 +; GISEL-NEXT: v_mul_hi_u32 v1, v19, v1 +; GISEL-NEXT: v_add_i32_e32 v17, vcc, v0, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v14, v0 +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v1, v0 +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v17, 0 ; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v15, v[1:2] -; GISEL-NEXT: v_mad_u64_u32 v[15:16], s[4:5], 0, v18, v[13:14] -; GISEL-NEXT: v_sub_i32_e32 v16, vcc, v17, v0 -; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v20, v15 -; GISEL-NEXT: v_subb_u32_e64 v17, s[4:5], v20, v15, vcc +; GISEL-NEXT: v_mad_u64_u32 v[15:16], s[4:5], 0, v17, v[13:14] +; GISEL-NEXT: v_sub_i32_e32 v16, vcc, v18, v0 +; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v19, v15 +; GISEL-NEXT: v_subb_u32_e64 v17, s[4:5], v19, v15, vcc ; GISEL-NEXT: v_subbrev_u32_e32 v0, vcc, 0, v0, vcc ; GISEL-NEXT: v_sub_i32_e32 v18, vcc, v16, v4 ; GISEL-NEXT: v_subbrev_u32_e32 v19, vcc, 0, v0, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v18, v4 ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v16, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v19 +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v18, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[4:5] ; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v17 -; GISEL-NEXT: v_cndmask_b32_e32 v20, -1, v0, vcc -; GISEL-NEXT: v_mov_b32_e32 v0, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v19 ; GISEL-NEXT: v_cndmask_b32_e64 v15, -1, v1, s[4:5] -; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v9, v[0:1] -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v8, v[13:14] -; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v18, v4 -; GISEL-NEXT: v_mul_lo_u32 v13, v8, v0 -; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v19, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v20, -1, v0, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v8, v[5:6] +; GISEL-NEXT: 
v_mad_u64_u32 v[13:14], s[4:5], s6, v7, v[0:1] +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v18, v4 +; GISEL-NEXT: v_mul_lo_u32 v5, v7, v13 +; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v19, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 -; GISEL-NEXT: v_cndmask_b32_e32 v14, v18, v1, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v5, v19, v5, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v13 +; GISEL-NEXT: v_cndmask_b32_e32 v14, v18, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v18, v19, v1, vcc +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v5, v8, v13 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v1, v0 +; GISEL-NEXT: v_mul_hi_u32 v1, v7, v13 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v11 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v5, v1 +; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v10, v5 +; GISEL-NEXT: v_mul_hi_u32 v10, v8, v13 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v11, v9, v0 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1 -; GISEL-NEXT: v_mul_hi_u32 v10, v8, v0 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11 -; GISEL-NEXT: v_mul_hi_u32 v0, v9, v0 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v5, v1 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v10 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v8, v1 -; GISEL-NEXT: v_addc_u32_e32 v11, vcc, v9, v0, vcc +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v7, v0 +; GISEL-NEXT: v_addc_u32_e32 v11, vcc, v8, v1, vcc ; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v10, 0 ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v15 -; GISEL-NEXT: v_cndmask_b32_e32 v13, v17, v5, vcc -; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v6, v11, v[1:2] -; GISEL-NEXT: v_cndmask_b32_e32 v12, v16, v14, vcc -; GISEL-NEXT: v_xor_b32_e32 v1, v12, v7 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], s6, v10, v[8:9] -; GISEL-NEXT: v_ashrrev_i32_e32 v12, 31, v3 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v12 -; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v12, vcc -; GISEL-NEXT: v_xor_b32_e32 v9, v2, v12 +; GISEL-NEXT: v_cndmask_b32_e32 v5, v16, v14, vcc +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v6, v11, v[1:2] +; GISEL-NEXT: v_xor_b32_e32 v1, v5, v9 +; GISEL-NEXT: v_ashrrev_i32_e32 v13, 31, v3 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], s6, v10, v[7:8] +; GISEL-NEXT: v_cndmask_b32_e32 v12, v17, v18, vcc +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v13 +; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v13, vcc +; GISEL-NEXT: v_xor_b32_e32 v14, v2, v13 ; GISEL-NEXT: v_mul_lo_u32 v2, v11, v0 ; GISEL-NEXT: v_mul_lo_u32 v6, v10, v5 -; GISEL-NEXT: v_xor_b32_e32 v14, v3, v12 +; GISEL-NEXT: v_xor_b32_e32 v15, v3, v13 ; GISEL-NEXT: v_mul_hi_u32 v3, v10, v0 ; GISEL-NEXT: v_mul_hi_u32 v0, v11, v0 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v6 @@ -1280,37 +1273,36 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) { ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v5, v2 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v0 ; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v11, v2, vcc -; GISEL-NEXT: 
v_mul_lo_u32 v3, v14, v0 -; GISEL-NEXT: v_mul_lo_u32 v5, v9, v2 -; GISEL-NEXT: v_mul_hi_u32 v6, v9, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v14, v0 -; GISEL-NEXT: v_xor_b32_e32 v8, v13, v7 +; GISEL-NEXT: v_mul_lo_u32 v3, v15, v0 +; GISEL-NEXT: v_mul_lo_u32 v5, v14, v2 +; GISEL-NEXT: v_mul_hi_u32 v6, v14, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v15, v0 +; GISEL-NEXT: v_xor_b32_e32 v7, v12, v9 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v6, v14, v2 +; GISEL-NEXT: v_mul_lo_u32 v6, v15, v2 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v5, v3 -; GISEL-NEXT: v_mul_hi_u32 v5, v9, v2 +; GISEL-NEXT: v_mul_hi_u32 v5, v14, v2 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v6, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v5 ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v0, v3 -; GISEL-NEXT: v_mul_hi_u32 v6, v14, v2 -; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v10, 0 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v6, v0 -; GISEL-NEXT: v_mov_b32_e32 v0, v3 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v11, v[0:1] -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v1, v7 -; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v8, v7, vcc +; GISEL-NEXT: v_mul_hi_u32 v5, v15, v2 +; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v10, 0 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v0, v[3:4] +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v1, v9 +; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v7, v9, vcc ; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], 0, v10, v[5:6] -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v9, v2 -; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], v14, v7 -; GISEL-NEXT: v_subb_u32_e64 v3, s[4:5], v14, v7, vcc +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v14, v2 +; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], v15, v7 +; GISEL-NEXT: v_subb_u32_e64 v3, s[4:5], v15, v7, vcc ; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc ; GISEL-NEXT: v_sub_i32_e32 v7, vcc, v2, v4 ; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc @@ -1330,10 +1322,10 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) { ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 ; GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc -; GISEL-NEXT: v_xor_b32_e32 v2, v2, v12 -; GISEL-NEXT: v_xor_b32_e32 v3, v3, v12 -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v12 -; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v12, vcc +; GISEL-NEXT: v_xor_b32_e32 v2, v2, v13 +; GISEL-NEXT: v_xor_b32_e32 v3, v3, v13 +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v13 +; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v13, vcc ; GISEL-NEXT: s_setpc_b64 s[30:31] ; ; CGP-LABEL: v_srem_v2i64_pow2k_denom: @@ -1346,176 +1338,173 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) { ; CGP-NEXT: v_rcp_iflag_f32_e32 v4, v4 ; CGP-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 ; CGP-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4 -; CGP-NEXT: v_trunc_f32_e32 v8, v5 -; CGP-NEXT: v_mac_f32_e32 v4, 0xcf800000, v8 +; CGP-NEXT: v_trunc_f32_e32 v5, v5 +; CGP-NEXT: v_mac_f32_e32 v4, 0xcf800000, v5 ; CGP-NEXT: v_cvt_u32_f32_e32 v7, v4 -; CGP-NEXT: v_cvt_u32_f32_e32 v8, v8 +; CGP-NEXT: v_cvt_u32_f32_e32 v8, v5 ; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v7, 0 -; CGP-NEXT: v_mov_b32_e32 v9, v5 -; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v6, 
v8, v[9:10] -; CGP-NEXT: v_mul_hi_u32 v12, v8, v4 -; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], -1, v7, v[10:11] -; CGP-NEXT: v_mul_lo_u32 v10, v8, v4 -; CGP-NEXT: v_mul_hi_u32 v11, v7, v4 -; CGP-NEXT: v_mul_lo_u32 v4, v7, v13 -; CGP-NEXT: v_mul_lo_u32 v9, v8, v13 -; CGP-NEXT: v_mul_hi_u32 v14, v7, v13 -; CGP-NEXT: v_mul_hi_u32 v13, v8, v13 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v10, v4 +; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v6, v8, v[5:6] +; CGP-NEXT: v_mul_hi_u32 v11, v8, v4 +; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], -1, v7, v[9:10] +; CGP-NEXT: v_mul_lo_u32 v9, v8, v4 +; CGP-NEXT: v_mul_hi_u32 v10, v7, v4 +; CGP-NEXT: v_mul_lo_u32 v4, v7, v12 +; CGP-NEXT: v_mul_lo_u32 v13, v8, v12 +; CGP-NEXT: v_mul_hi_u32 v14, v7, v12 +; CGP-NEXT: v_mul_hi_u32 v12, v8, v12 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v9, v4 ; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v11 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v10 ; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, v15, v4 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v12 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v11 ; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v14 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v14 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v9, v4 -; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v9, vcc, v14, v9 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v13, v9 -; CGP-NEXT: v_add_i32_e32 v18, vcc, v7, v4 -; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v18, 0 -; CGP-NEXT: v_addc_u32_e32 v19, vcc, v8, v9, vcc -; CGP-NEXT: v_mov_b32_e32 v4, v14 -; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v19, v[4:5] -; CGP-NEXT: v_mul_lo_u32 v4, v19, v13 -; CGP-NEXT: v_mad_u64_u32 v[16:17], s[4:5], -1, v18, v[14:15] -; CGP-NEXT: v_mul_lo_u32 v9, v18, v16 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9 -; CGP-NEXT: v_mul_hi_u32 v9, v18, v13 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v13, v4 +; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 +; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v13 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v7, v4 +; CGP-NEXT: v_addc_u32_e32 v16, vcc, v8, v12, vcc +; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v6, v4, 0 +; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v16, v[13:14] +; CGP-NEXT: v_mul_lo_u32 v17, v16, v12 +; CGP-NEXT: v_mul_hi_u32 v18, v4, v12 +; CGP-NEXT: v_mul_hi_u32 v19, v16, v12 +; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], -1, v4, v[14:15] +; CGP-NEXT: v_mul_lo_u32 v13, v4, v12 +; CGP-NEXT: v_mul_hi_u32 v15, v4, v12 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v17, v13 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9 -; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_mul_hi_u32 v9, v19, v13 -; CGP-NEXT: v_mul_lo_u32 v13, v19, v16 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v14, v4 -; CGP-NEXT: v_mul_hi_u32 v14, v18, v16 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v13, v9 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v18 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v14, vcc, v9, v14 -; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v9 -; CGP-NEXT: v_ashrrev_i32_e32 v9, 31, v1 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v9 -; CGP-NEXT: v_addc_u32_e32 v1, vcc, v1, v9, vcc -; CGP-NEXT: v_xor_b32_e32 v15, v0, v9 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v14, v4 -; CGP-NEXT: v_mul_hi_u32 v4, v19, v16 -; CGP-NEXT: v_xor_b32_e32 v17, v1, 
v9 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 +; CGP-NEXT: v_mul_lo_u32 v14, v16, v12 +; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v19 +; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v15 +; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v15, vcc, v17, v15 +; CGP-NEXT: v_mul_hi_u32 v17, v16, v12 +; CGP-NEXT: v_ashrrev_i32_e32 v12, 31, v1 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v12 +; CGP-NEXT: v_addc_u32_e32 v1, vcc, v1, v12, vcc +; CGP-NEXT: v_xor_b32_e32 v18, v0, v12 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v14, v13 +; CGP-NEXT: v_xor_b32_e32 v19, v1, v12 ; CGP-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v1, vcc, v13, v1 -; CGP-NEXT: v_add_i32_e32 v1, vcc, v4, v1 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v18, v0 -; CGP-NEXT: v_addc_u32_e32 v1, vcc, v19, v1, vcc -; CGP-NEXT: v_mul_lo_u32 v13, v17, v0 -; CGP-NEXT: v_mul_lo_u32 v14, v15, v1 -; CGP-NEXT: v_mul_hi_u32 v16, v15, v0 -; CGP-NEXT: v_mul_hi_u32 v0, v17, v0 +; CGP-NEXT: v_add_i32_e32 v1, vcc, v15, v1 +; CGP-NEXT: v_add_i32_e32 v1, vcc, v17, v1 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v4, v0 +; CGP-NEXT: v_addc_u32_e32 v1, vcc, v16, v1, vcc +; CGP-NEXT: v_mul_lo_u32 v13, v19, v0 +; CGP-NEXT: v_mul_lo_u32 v14, v18, v1 +; CGP-NEXT: v_mul_hi_u32 v15, v18, v0 +; CGP-NEXT: v_mul_hi_u32 v0, v19, v0 ; CGP-NEXT: v_mov_b32_e32 v4, 0x1000 ; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v14 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v16 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v15 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v16, v17, v1 +; CGP-NEXT: v_mul_lo_u32 v15, v19, v1 ; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; CGP-NEXT: v_mul_hi_u32 v14, v15, v1 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v16, v0 -; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc +; CGP-NEXT: v_mul_hi_u32 v14, v18, v1 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v15, v0 +; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v14 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v14, vcc, v16, v14 +; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14 ; CGP-NEXT: v_add_i32_e32 v13, vcc, v0, v13 -; CGP-NEXT: v_mul_hi_u32 v18, v17, v1 +; CGP-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; CGP-NEXT: v_mul_hi_u32 v15, v19, v1 +; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v0 ; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v13, 0 -; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v16 -; CGP-NEXT: v_add_i32_e32 v16, vcc, v18, v13 -; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v16, v[1:2] -; CGP-NEXT: v_sub_i32_e32 v15, vcc, v15, v0 -; CGP-NEXT: v_sub_i32_e64 v0, s[4:5], v17, v13 -; CGP-NEXT: v_subb_u32_e64 v16, s[4:5], v17, v13, vcc +; CGP-NEXT: v_add_i32_e32 v15, vcc, v15, v14 +; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v15, v[1:2] +; CGP-NEXT: v_sub_i32_e32 v15, vcc, v18, v0 +; CGP-NEXT: v_sub_i32_e64 v0, s[4:5], v19, v13 +; CGP-NEXT: v_subb_u32_e64 v16, s[4:5], v19, v13, vcc ; CGP-NEXT: v_subbrev_u32_e32 v0, vcc, 0, v0, vcc ; CGP-NEXT: v_sub_i32_e32 v18, vcc, v15, v4 -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v15, v4 ; CGP-NEXT: v_subbrev_u32_e32 v19, vcc, 0, v0, vcc +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v15, v4 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v18, v4 ; CGP-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v16 -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v18, v4 -; CGP-NEXT: v_cndmask_b32_e64 v17, -1, v1, s[4:5] -; CGP-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc -; CGP-NEXT: 
v_mov_b32_e32 v0, v5 -; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v8, v[0:1] +; CGP-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc ; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v19 -; CGP-NEXT: v_cndmask_b32_e32 v5, -1, v1, vcc -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], -1, v7, v[13:14] -; CGP-NEXT: v_sub_i32_e32 v1, vcc, v18, v4 -; CGP-NEXT: v_subbrev_u32_e32 v13, vcc, 0, v19, vcc -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 -; CGP-NEXT: v_mul_lo_u32 v5, v7, v0 -; CGP-NEXT: v_cndmask_b32_e32 v14, v18, v1, vcc -; CGP-NEXT: v_cndmask_b32_e32 v13, v19, v13, vcc -; CGP-NEXT: v_add_i32_e32 v1, vcc, v10, v5 -; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v11 -; CGP-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v10, v8, v0 -; CGP-NEXT: v_add_i32_e32 v1, vcc, v5, v1 -; CGP-NEXT: v_mul_hi_u32 v5, v7, v0 -; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v12 -; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v5 -; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; CGP-NEXT: v_mul_hi_u32 v0, v8, v0 -; CGP-NEXT: v_add_i32_e32 v1, vcc, v5, v1 -; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v5 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v5 -; CGP-NEXT: v_add_i32_e32 v10, vcc, v7, v1 -; CGP-NEXT: v_addc_u32_e32 v11, vcc, v8, v0, vcc -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v10, 0 +; CGP-NEXT: v_cndmask_b32_e64 v17, -1, v1, s[4:5] +; CGP-NEXT: v_cndmask_b32_e32 v20, -1, v0, vcc +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v8, v[5:6] +; CGP-NEXT: v_sub_i32_e32 v5, vcc, v18, v4 +; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], -1, v7, v[0:1] +; CGP-NEXT: v_subbrev_u32_e32 v21, vcc, 0, v19, vcc +; CGP-NEXT: v_mul_lo_u32 v1, v7, v13 +; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 +; CGP-NEXT: v_cndmask_b32_e32 v0, v18, v5, vcc +; CGP-NEXT: v_cndmask_b32_e32 v5, v19, v21, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17 -; CGP-NEXT: v_cndmask_b32_e32 v5, v15, v14, vcc -; CGP-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v6, v11, v[1:2] -; CGP-NEXT: v_cndmask_b32_e32 v12, v16, v13, vcc -; CGP-NEXT: v_xor_b32_e32 v13, v5, v9 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], -1, v10, v[7:8] -; CGP-NEXT: v_xor_b32_e32 v1, v12, v9 -; CGP-NEXT: v_ashrrev_i32_e32 v12, 31, v3 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v12 -; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v12, vcc -; CGP-NEXT: v_xor_b32_e32 v7, v2, v12 -; CGP-NEXT: v_mul_lo_u32 v2, v11, v0 -; CGP-NEXT: v_mul_lo_u32 v6, v10, v5 -; CGP-NEXT: v_xor_b32_e32 v8, v3, v12 -; CGP-NEXT: v_mul_hi_u32 v3, v10, v0 -; CGP-NEXT: v_mul_hi_u32 v0, v11, v0 +; CGP-NEXT: v_cndmask_b32_e32 v14, v15, v0, vcc +; CGP-NEXT: v_add_i32_e64 v0, s[4:5], v9, v1 +; CGP-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5] +; CGP-NEXT: v_add_i32_e64 v0, s[4:5], v0, v10 +; CGP-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; CGP-NEXT: v_mul_lo_u32 v9, v8, v13 +; CGP-NEXT: v_add_i32_e64 v0, s[4:5], v1, v0 +; CGP-NEXT: v_mul_hi_u32 v1, v7, v13 +; CGP-NEXT: v_add_i32_e64 v9, s[4:5], v9, v11 +; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5] +; CGP-NEXT: v_add_i32_e64 v1, s[4:5], v9, v1 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[4:5] +; CGP-NEXT: v_add_i32_e64 v9, s[4:5], v10, v9 +; CGP-NEXT: v_mul_hi_u32 v10, v8, v13 +; CGP-NEXT: v_add_i32_e64 v0, s[4:5], v1, v0 +; CGP-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5] +; CGP-NEXT: v_add_i32_e64 v1, s[4:5], v9, v1 +; CGP-NEXT: v_add_i32_e64 v1, s[4:5], v10, v1 +; CGP-NEXT: v_add_i32_e64 v9, s[4:5], v7, v0 +; CGP-NEXT: v_addc_u32_e64 v10, s[4:5], v8, v1, s[4:5] +; 
CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v9, 0
+; CGP-NEXT: v_cndmask_b32_e32 v5, v16, v5, vcc
+; CGP-NEXT: v_ashrrev_i32_e32 v13, 31, v3
+; CGP-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v6, v10, v[1:2]
+; CGP-NEXT: v_xor_b32_e32 v1, v5, v12
+; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v13
+; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], -1, v9, v[7:8]
+; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v13, vcc
+; CGP-NEXT: v_xor_b32_e32 v7, v2, v13
+; CGP-NEXT: v_mul_lo_u32 v2, v10, v0
+; CGP-NEXT: v_mul_lo_u32 v6, v9, v5
+; CGP-NEXT: v_xor_b32_e32 v8, v3, v13
+; CGP-NEXT: v_mul_hi_u32 v3, v9, v0
+; CGP-NEXT: v_mul_hi_u32 v0, v10, v0
; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v6
; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v3
; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; CGP-NEXT: v_mul_lo_u32 v3, v11, v5
+; CGP-NEXT: v_mul_lo_u32 v3, v10, v5
; CGP-NEXT: v_add_i32_e32 v2, vcc, v6, v2
-; CGP-NEXT: v_mul_hi_u32 v6, v10, v5
+; CGP-NEXT: v_mul_hi_u32 v6, v9, v5
; CGP-NEXT: v_add_i32_e32 v0, vcc, v3, v0
; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v6
; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v6
-; CGP-NEXT: v_mul_hi_u32 v5, v11, v5
+; CGP-NEXT: v_mul_hi_u32 v5, v10, v5
; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v2
; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v2, vcc, v3, v2
; CGP-NEXT: v_add_i32_e32 v2, vcc, v5, v2
-; CGP-NEXT: v_add_i32_e32 v3, vcc, v10, v0
-; CGP-NEXT: v_addc_u32_e32 v2, vcc, v11, v2, vcc
+; CGP-NEXT: v_add_i32_e32 v3, vcc, v9, v0
+; CGP-NEXT: v_addc_u32_e32 v2, vcc, v10, v2, vcc
; CGP-NEXT: v_mul_lo_u32 v5, v8, v3
; CGP-NEXT: v_mul_lo_u32 v6, v7, v2
-; CGP-NEXT: v_sub_i32_e32 v0, vcc, v13, v9
-; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v9, vcc
+; CGP-NEXT: v_xor_b32_e32 v11, v14, v12
; CGP-NEXT: v_mul_hi_u32 v9, v7, v3
+; CGP-NEXT: v_sub_i32_e32 v0, vcc, v11, v12
+; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v12, vcc
; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v6
; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v9
@@ -1530,11 +1519,11 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) {
; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v6, vcc, v9, v6
; CGP-NEXT: v_add_i32_e32 v5, vcc, v3, v5
-; CGP-NEXT: v_mul_hi_u32 v10, v8, v2
+; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
+; CGP-NEXT: v_mul_hi_u32 v9, v8, v2
+; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v3
; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v5, 0
-; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v9
-; CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v5
+; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v6
; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v9, v[3:4]
; CGP-NEXT: v_sub_i32_e32 v2, vcc, v7, v2
; CGP-NEXT: v_subb_u32_e64 v3, s[4:5], v8, v5, vcc
@@ -1558,10 +1547,10 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) {
; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6
; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
; CGP-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc
-; CGP-NEXT: v_xor_b32_e32 v2, v2, v12
-; CGP-NEXT: v_xor_b32_e32 v3, v3, v12
-; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v12
-; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v12, vcc
+; CGP-NEXT: v_xor_b32_e32 v2, v2, v13
+; CGP-NEXT: v_xor_b32_e32 v3, v3, v13
+; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v13
+; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v13, vcc
; CGP-NEXT: s_setpc_b64 s[30:31]
%result = srem <2 x i64> %num, <i64 4096, i64 4096>
ret <2 x i64> %result
}
@@ -1578,10 +1567,10 @@ define i64 
@v_srem_i64_oddk_denom(i64 %num) { ; CHECK-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; CHECK-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 ; CHECK-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2 -; CHECK-NEXT: v_trunc_f32_e32 v4, v3 -; CHECK-NEXT: v_mac_f32_e32 v2, 0xcf800000, v4 +; CHECK-NEXT: v_trunc_f32_e32 v3, v3 +; CHECK-NEXT: v_mac_f32_e32 v2, 0xcf800000, v3 ; CHECK-NEXT: v_cvt_u32_f32_e32 v8, v2 -; CHECK-NEXT: v_cvt_u32_f32_e32 v10, v4 +; CHECK-NEXT: v_cvt_u32_f32_e32 v10, v3 ; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v8, 0 ; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v9, v10, v[3:4] ; CHECK-NEXT: v_mul_lo_u32 v3, v10, v2 @@ -1657,11 +1646,11 @@ define i64 @v_srem_i64_oddk_denom(i64 %num) { ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v7, v3 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v0, v2 -; CHECK-NEXT: v_mul_hi_u32 v8, v5, v1 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; CHECK-NEXT: v_mul_hi_u32 v7, v5, v1 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v0 ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v2, 0 -; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v7 -; CHECK-NEXT: v_add_i32_e32 v7, vcc, v8, v2 +; CHECK-NEXT: v_add_i32_e32 v7, vcc, v7, v3 ; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v7, v[1:2] ; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v4, v0 ; CHECK-NEXT: v_subb_u32_e64 v1, s[4:5], v5, v2, vcc @@ -1708,153 +1697,150 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) { ; GISEL-NEXT: s_subb_u32 s6, 0, 0 ; GISEL-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 ; GISEL-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4 -; GISEL-NEXT: v_trunc_f32_e32 v7, v5 -; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v7 -; GISEL-NEXT: v_cvt_u32_f32_e32 v8, v4 -; GISEL-NEXT: v_cvt_u32_f32_e32 v9, v7 -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v8, 0 -; GISEL-NEXT: v_mov_b32_e32 v7, v5 -; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v6, v9, v[7:8] -; GISEL-NEXT: v_mul_hi_u32 v12, v9, v4 -; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], s6, v8, v[10:11] -; GISEL-NEXT: v_mul_lo_u32 v10, v9, v4 -; GISEL-NEXT: v_mul_hi_u32 v11, v8, v4 -; GISEL-NEXT: v_mul_lo_u32 v7, v8, v13 -; GISEL-NEXT: v_mul_lo_u32 v4, v9, v13 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v10, v7 +; GISEL-NEXT: v_trunc_f32_e32 v5, v5 +; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v5 +; GISEL-NEXT: v_cvt_u32_f32_e32 v7, v4 +; GISEL-NEXT: v_cvt_u32_f32_e32 v8, v5 +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v7, 0 +; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v6, v8, v[5:6] +; GISEL-NEXT: v_mul_hi_u32 v11, v7, v4 +; GISEL-NEXT: v_mul_hi_u32 v12, v8, v4 +; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], s6, v7, v[9:10] +; GISEL-NEXT: v_mul_lo_u32 v10, v8, v4 +; GISEL-NEXT: v_mul_lo_u32 v9, v7, v13 +; GISEL-NEXT: v_mul_lo_u32 v4, v8, v13 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v14, v7 -; GISEL-NEXT: v_mul_hi_u32 v14, v8, v13 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v14, v9 +; GISEL-NEXT: v_mul_hi_u32 v14, v7, v13 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v14 -; GISEL-NEXT: v_mul_hi_u32 v13, v9, v13 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v7 -; GISEL-NEXT: 
v_cndmask_b32_e64 v7, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v14, v7 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v13, v7 -; GISEL-NEXT: v_add_i32_e32 v18, vcc, v8, v4 -; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v18, 0 -; GISEL-NEXT: v_addc_u32_e32 v19, vcc, v9, v7, vcc -; GISEL-NEXT: v_mov_b32_e32 v4, v14 -; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v19, v[4:5] -; GISEL-NEXT: v_mul_lo_u32 v4, v19, v13 -; GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], s6, v18, v[14:15] +; GISEL-NEXT: v_mul_hi_u32 v13, v8, v13 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v14, v9 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v13, v9 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v7, v4 +; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v4, 0 +; GISEL-NEXT: v_addc_u32_e32 v17, vcc, v8, v9, vcc +; GISEL-NEXT: v_mad_u64_u32 v[15:16], s[4:5], v6, v17, v[14:15] +; GISEL-NEXT: v_mul_lo_u32 v9, v17, v13 +; GISEL-NEXT: v_mul_hi_u32 v18, v4, v13 +; GISEL-NEXT: v_mul_hi_u32 v19, v17, v13 +; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], s6, v4, v[15:16] ; GISEL-NEXT: s_mov_b32 s6, 1 ; GISEL-NEXT: s_cmp_lg_u32 s6, 0 -; GISEL-NEXT: v_mul_lo_u32 v7, v18, v16 +; GISEL-NEXT: v_mul_lo_u32 v14, v4, v13 +; GISEL-NEXT: v_mul_hi_u32 v15, v4, v13 ; GISEL-NEXT: s_subb_u32 s6, 0, 0 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v7 -; GISEL-NEXT: v_mul_hi_u32 v7, v18, v13 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GISEL-NEXT: v_mul_hi_u32 v7, v19, v13 -; GISEL-NEXT: v_mul_lo_u32 v13, v19, v16 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v14, v4 -; GISEL-NEXT: v_mul_hi_u32 v14, v18, v16 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v13, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v7, v14 -; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v7 -; GISEL-NEXT: v_ashrrev_i32_e32 v7, 31, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v7 -; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v7, vcc -; GISEL-NEXT: v_xor_b32_e32 v17, v0, v7 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v14, v4 -; GISEL-NEXT: v_mul_hi_u32 v4, v19, v16 -; GISEL-NEXT: v_xor_b32_e32 v20, v1, v7 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v18 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v9 +; GISEL-NEXT: v_mul_lo_u32 v9, v17, v13 +; GISEL-NEXT: v_mul_hi_u32 v13, v17, v13 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v19 +; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v9, v15 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v16, vcc, v16, v9 +; GISEL-NEXT: v_ashrrev_i32_e32 v9, 31, v1 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v9 +; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v9, vcc +; GISEL-NEXT: v_xor_b32_e32 v18, v0, v9 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v15, v14 +; GISEL-NEXT: v_xor_b32_e32 v19, v1, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v16, v1 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v13, v1 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v4, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v18, v0 -; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v19, v1, vcc -; GISEL-NEXT: v_mul_lo_u32 v13, v20, v0 -; GISEL-NEXT: v_mul_lo_u32 v14, v17, v1 -; GISEL-NEXT: v_mul_hi_u32 v15, v17, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v20, v0 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v4, v0 +; GISEL-NEXT: 
v_addc_u32_e32 v1, vcc, v17, v1, vcc +; GISEL-NEXT: v_mul_lo_u32 v13, v19, v0 +; GISEL-NEXT: v_mul_lo_u32 v14, v18, v1 +; GISEL-NEXT: v_mul_hi_u32 v15, v18, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v19, v0 ; GISEL-NEXT: v_mov_b32_e32 v4, 0x12d8fb ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v15 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v15, v20, v1 +; GISEL-NEXT: v_mul_lo_u32 v15, v19, v1 ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; GISEL-NEXT: v_mul_hi_u32 v14, v17, v1 +; GISEL-NEXT: v_mul_hi_u32 v14, v18, v1 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v15, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v14 -; GISEL-NEXT: v_add_i32_e32 v18, vcc, v0, v13 -; GISEL-NEXT: v_mul_hi_u32 v15, v20, v1 -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v18, 0 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v13 +; GISEL-NEXT: v_mul_hi_u32 v1, v19, v1 +; GISEL-NEXT: v_add_i32_e32 v17, vcc, v0, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v14, v0 +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v1, v0 +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v17, 0 ; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v15, v[1:2] -; GISEL-NEXT: v_mad_u64_u32 v[15:16], s[4:5], 0, v18, v[13:14] -; GISEL-NEXT: v_sub_i32_e32 v16, vcc, v17, v0 -; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v20, v15 -; GISEL-NEXT: v_subb_u32_e64 v17, s[4:5], v20, v15, vcc +; GISEL-NEXT: v_mad_u64_u32 v[15:16], s[4:5], 0, v17, v[13:14] +; GISEL-NEXT: v_sub_i32_e32 v16, vcc, v18, v0 +; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v19, v15 +; GISEL-NEXT: v_subb_u32_e64 v17, s[4:5], v19, v15, vcc ; GISEL-NEXT: v_subbrev_u32_e32 v0, vcc, 0, v0, vcc ; GISEL-NEXT: v_sub_i32_e32 v18, vcc, v16, v4 ; GISEL-NEXT: v_subbrev_u32_e32 v19, vcc, 0, v0, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v18, v4 ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v16, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v19 +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v18, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[4:5] ; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v17 -; GISEL-NEXT: v_cndmask_b32_e32 v20, -1, v0, vcc -; GISEL-NEXT: v_mov_b32_e32 v0, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v19 ; GISEL-NEXT: v_cndmask_b32_e64 v15, -1, v1, s[4:5] -; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v9, v[0:1] -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v8, v[13:14] -; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v18, v4 -; GISEL-NEXT: v_mul_lo_u32 v13, v8, v0 -; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v19, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v20, -1, v0, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v8, v[5:6] +; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], s6, v7, v[0:1] +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v18, v4 +; GISEL-NEXT: v_mul_lo_u32 v5, v7, v13 +; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v19, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 -; GISEL-NEXT: v_cndmask_b32_e32 v14, v18, v1, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v5, v19, v5, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v13 +; GISEL-NEXT: v_cndmask_b32_e32 v14, v18, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v18, v19, v1, vcc +; GISEL-NEXT: v_add_i32_e32 v0, vcc, 
v10, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v5, v8, v13 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v1, v0 +; GISEL-NEXT: v_mul_hi_u32 v1, v7, v13 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v11 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v5, v1 +; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v10, v5 +; GISEL-NEXT: v_mul_hi_u32 v10, v8, v13 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v11, v9, v0 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1 -; GISEL-NEXT: v_mul_hi_u32 v10, v8, v0 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11 -; GISEL-NEXT: v_mul_hi_u32 v0, v9, v0 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v5, v1 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v10 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v8, v1 -; GISEL-NEXT: v_addc_u32_e32 v11, vcc, v9, v0, vcc +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v7, v0 +; GISEL-NEXT: v_addc_u32_e32 v11, vcc, v8, v1, vcc ; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v10, 0 ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v15 -; GISEL-NEXT: v_cndmask_b32_e32 v13, v17, v5, vcc -; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v6, v11, v[1:2] -; GISEL-NEXT: v_cndmask_b32_e32 v12, v16, v14, vcc -; GISEL-NEXT: v_xor_b32_e32 v1, v12, v7 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], s6, v10, v[8:9] -; GISEL-NEXT: v_ashrrev_i32_e32 v12, 31, v3 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v12 -; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v12, vcc -; GISEL-NEXT: v_xor_b32_e32 v9, v2, v12 +; GISEL-NEXT: v_cndmask_b32_e32 v5, v16, v14, vcc +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v6, v11, v[1:2] +; GISEL-NEXT: v_xor_b32_e32 v1, v5, v9 +; GISEL-NEXT: v_ashrrev_i32_e32 v13, 31, v3 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], s6, v10, v[7:8] +; GISEL-NEXT: v_cndmask_b32_e32 v12, v17, v18, vcc +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v13 +; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v13, vcc +; GISEL-NEXT: v_xor_b32_e32 v14, v2, v13 ; GISEL-NEXT: v_mul_lo_u32 v2, v11, v0 ; GISEL-NEXT: v_mul_lo_u32 v6, v10, v5 -; GISEL-NEXT: v_xor_b32_e32 v14, v3, v12 +; GISEL-NEXT: v_xor_b32_e32 v15, v3, v13 ; GISEL-NEXT: v_mul_hi_u32 v3, v10, v0 ; GISEL-NEXT: v_mul_hi_u32 v0, v11, v0 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v6 @@ -1876,37 +1862,36 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) { ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v5, v2 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v0 ; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v11, v2, vcc -; GISEL-NEXT: v_mul_lo_u32 v3, v14, v0 -; GISEL-NEXT: v_mul_lo_u32 v5, v9, v2 -; GISEL-NEXT: v_mul_hi_u32 v6, v9, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v14, v0 -; GISEL-NEXT: v_xor_b32_e32 v8, v13, v7 +; GISEL-NEXT: v_mul_lo_u32 v3, v15, v0 +; GISEL-NEXT: v_mul_lo_u32 v5, v14, v2 +; GISEL-NEXT: v_mul_hi_u32 v6, v14, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v15, v0 +; GISEL-NEXT: v_xor_b32_e32 v7, v12, v9 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v6 
; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v6, v14, v2 +; GISEL-NEXT: v_mul_lo_u32 v6, v15, v2 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v5, v3 -; GISEL-NEXT: v_mul_hi_u32 v5, v9, v2 +; GISEL-NEXT: v_mul_hi_u32 v5, v14, v2 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v6, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v5 ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v0, v3 -; GISEL-NEXT: v_mul_hi_u32 v6, v14, v2 -; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v10, 0 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v6, v0 -; GISEL-NEXT: v_mov_b32_e32 v0, v3 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v11, v[0:1] -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v1, v7 -; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v8, v7, vcc +; GISEL-NEXT: v_mul_hi_u32 v5, v15, v2 +; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v10, 0 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v0, v[3:4] +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v1, v9 +; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v7, v9, vcc ; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], 0, v10, v[5:6] -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v9, v2 -; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], v14, v7 -; GISEL-NEXT: v_subb_u32_e64 v3, s[4:5], v14, v7, vcc +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v14, v2 +; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], v15, v7 +; GISEL-NEXT: v_subb_u32_e64 v3, s[4:5], v15, v7, vcc ; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc ; GISEL-NEXT: v_sub_i32_e32 v7, vcc, v2, v4 ; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc @@ -1926,10 +1911,10 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) { ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 ; GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc -; GISEL-NEXT: v_xor_b32_e32 v2, v2, v12 -; GISEL-NEXT: v_xor_b32_e32 v3, v3, v12 -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v12 -; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v12, vcc +; GISEL-NEXT: v_xor_b32_e32 v2, v2, v13 +; GISEL-NEXT: v_xor_b32_e32 v3, v3, v13 +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v13 +; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v13, vcc ; GISEL-NEXT: s_setpc_b64 s[30:31] ; ; CGP-LABEL: v_srem_v2i64_oddk_denom: @@ -1942,176 +1927,173 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) { ; CGP-NEXT: v_rcp_iflag_f32_e32 v4, v4 ; CGP-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 ; CGP-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4 -; CGP-NEXT: v_trunc_f32_e32 v8, v5 -; CGP-NEXT: v_mac_f32_e32 v4, 0xcf800000, v8 +; CGP-NEXT: v_trunc_f32_e32 v5, v5 +; CGP-NEXT: v_mac_f32_e32 v4, 0xcf800000, v5 ; CGP-NEXT: v_cvt_u32_f32_e32 v7, v4 -; CGP-NEXT: v_cvt_u32_f32_e32 v8, v8 +; CGP-NEXT: v_cvt_u32_f32_e32 v8, v5 ; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v7, 0 -; CGP-NEXT: v_mov_b32_e32 v9, v5 -; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v6, v8, v[9:10] -; CGP-NEXT: v_mul_hi_u32 v12, v8, v4 -; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], -1, v7, v[10:11] -; CGP-NEXT: v_mul_lo_u32 v10, v8, v4 -; CGP-NEXT: v_mul_hi_u32 v11, v7, v4 -; CGP-NEXT: v_mul_lo_u32 v4, v7, v13 -; CGP-NEXT: v_mul_lo_u32 v9, v8, v13 -; CGP-NEXT: v_mul_hi_u32 v14, v7, v13 -; CGP-NEXT: v_mul_hi_u32 v13, v8, v13 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v10, v4 +; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v6, v8, v[5:6] +; CGP-NEXT: v_mul_hi_u32 v11, v8, v4 +; CGP-NEXT: v_mad_u64_u32 v[12:13], 
s[4:5], -1, v7, v[9:10] +; CGP-NEXT: v_mul_lo_u32 v9, v8, v4 +; CGP-NEXT: v_mul_hi_u32 v10, v7, v4 +; CGP-NEXT: v_mul_lo_u32 v4, v7, v12 +; CGP-NEXT: v_mul_lo_u32 v13, v8, v12 +; CGP-NEXT: v_mul_hi_u32 v14, v7, v12 +; CGP-NEXT: v_mul_hi_u32 v12, v8, v12 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v9, v4 ; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v11 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v10 ; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, v15, v4 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v12 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v11 ; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v14 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v14 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v9, v4 -; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v9, vcc, v14, v9 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v13, v9 -; CGP-NEXT: v_add_i32_e32 v18, vcc, v7, v4 -; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v18, 0 -; CGP-NEXT: v_addc_u32_e32 v19, vcc, v8, v9, vcc -; CGP-NEXT: v_mov_b32_e32 v4, v14 -; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v19, v[4:5] -; CGP-NEXT: v_mul_lo_u32 v4, v19, v13 -; CGP-NEXT: v_mad_u64_u32 v[16:17], s[4:5], -1, v18, v[14:15] -; CGP-NEXT: v_mul_lo_u32 v9, v18, v16 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9 -; CGP-NEXT: v_mul_hi_u32 v9, v18, v13 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v13, v4 +; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 +; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v13 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v7, v4 +; CGP-NEXT: v_addc_u32_e32 v16, vcc, v8, v12, vcc +; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v6, v4, 0 +; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v16, v[13:14] +; CGP-NEXT: v_mul_lo_u32 v17, v16, v12 +; CGP-NEXT: v_mul_hi_u32 v18, v4, v12 +; CGP-NEXT: v_mul_hi_u32 v19, v16, v12 +; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], -1, v4, v[14:15] +; CGP-NEXT: v_mul_lo_u32 v13, v4, v12 +; CGP-NEXT: v_mul_hi_u32 v15, v4, v12 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v17, v13 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9 -; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_mul_hi_u32 v9, v19, v13 -; CGP-NEXT: v_mul_lo_u32 v13, v19, v16 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v14, v4 -; CGP-NEXT: v_mul_hi_u32 v14, v18, v16 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v13, v9 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v18 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v14, vcc, v9, v14 -; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v9 -; CGP-NEXT: v_ashrrev_i32_e32 v9, 31, v1 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v9 -; CGP-NEXT: v_addc_u32_e32 v1, vcc, v1, v9, vcc -; CGP-NEXT: v_xor_b32_e32 v15, v0, v9 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v14, v4 -; CGP-NEXT: v_mul_hi_u32 v4, v19, v16 -; CGP-NEXT: v_xor_b32_e32 v17, v1, v9 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 +; CGP-NEXT: v_mul_lo_u32 v14, v16, v12 +; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v19 +; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v15 +; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v15, vcc, v17, v15 +; CGP-NEXT: v_mul_hi_u32 v17, v16, v12 +; CGP-NEXT: v_ashrrev_i32_e32 v12, 31, v1 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v12 +; CGP-NEXT: v_addc_u32_e32 v1, vcc, v1, v12, vcc +; CGP-NEXT: 
v_xor_b32_e32 v18, v0, v12 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v14, v13 +; CGP-NEXT: v_xor_b32_e32 v19, v1, v12 ; CGP-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v1, vcc, v13, v1 -; CGP-NEXT: v_add_i32_e32 v1, vcc, v4, v1 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v18, v0 -; CGP-NEXT: v_addc_u32_e32 v1, vcc, v19, v1, vcc -; CGP-NEXT: v_mul_lo_u32 v13, v17, v0 -; CGP-NEXT: v_mul_lo_u32 v14, v15, v1 -; CGP-NEXT: v_mul_hi_u32 v16, v15, v0 -; CGP-NEXT: v_mul_hi_u32 v0, v17, v0 +; CGP-NEXT: v_add_i32_e32 v1, vcc, v15, v1 +; CGP-NEXT: v_add_i32_e32 v1, vcc, v17, v1 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v4, v0 +; CGP-NEXT: v_addc_u32_e32 v1, vcc, v16, v1, vcc +; CGP-NEXT: v_mul_lo_u32 v13, v19, v0 +; CGP-NEXT: v_mul_lo_u32 v14, v18, v1 +; CGP-NEXT: v_mul_hi_u32 v15, v18, v0 +; CGP-NEXT: v_mul_hi_u32 v0, v19, v0 ; CGP-NEXT: v_mov_b32_e32 v4, 0x12d8fb ; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v14 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v16 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v15 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v16, v17, v1 +; CGP-NEXT: v_mul_lo_u32 v15, v19, v1 ; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; CGP-NEXT: v_mul_hi_u32 v14, v15, v1 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v16, v0 -; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc +; CGP-NEXT: v_mul_hi_u32 v14, v18, v1 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v15, v0 +; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v14 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v14, vcc, v16, v14 +; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14 ; CGP-NEXT: v_add_i32_e32 v13, vcc, v0, v13 -; CGP-NEXT: v_mul_hi_u32 v18, v17, v1 +; CGP-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; CGP-NEXT: v_mul_hi_u32 v15, v19, v1 +; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v0 ; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v13, 0 -; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v16 -; CGP-NEXT: v_add_i32_e32 v16, vcc, v18, v13 -; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v16, v[1:2] -; CGP-NEXT: v_sub_i32_e32 v15, vcc, v15, v0 -; CGP-NEXT: v_sub_i32_e64 v0, s[4:5], v17, v13 -; CGP-NEXT: v_subb_u32_e64 v16, s[4:5], v17, v13, vcc +; CGP-NEXT: v_add_i32_e32 v15, vcc, v15, v14 +; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v15, v[1:2] +; CGP-NEXT: v_sub_i32_e32 v15, vcc, v18, v0 +; CGP-NEXT: v_sub_i32_e64 v0, s[4:5], v19, v13 +; CGP-NEXT: v_subb_u32_e64 v16, s[4:5], v19, v13, vcc ; CGP-NEXT: v_subbrev_u32_e32 v0, vcc, 0, v0, vcc ; CGP-NEXT: v_sub_i32_e32 v18, vcc, v15, v4 -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v15, v4 ; CGP-NEXT: v_subbrev_u32_e32 v19, vcc, 0, v0, vcc +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v15, v4 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v18, v4 ; CGP-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v16 -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v18, v4 -; CGP-NEXT: v_cndmask_b32_e64 v17, -1, v1, s[4:5] -; CGP-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc -; CGP-NEXT: v_mov_b32_e32 v0, v5 -; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v8, v[0:1] +; CGP-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc ; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v19 -; CGP-NEXT: v_cndmask_b32_e32 v5, -1, v1, vcc -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], -1, v7, v[13:14] -; CGP-NEXT: v_sub_i32_e32 v1, vcc, v18, v4 -; CGP-NEXT: v_subbrev_u32_e32 v13, vcc, 0, v19, vcc -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 -; CGP-NEXT: v_mul_lo_u32 v5, v7, v0 -; CGP-NEXT: v_cndmask_b32_e32 v14, v18, v1, vcc -; 
CGP-NEXT: v_cndmask_b32_e32 v13, v19, v13, vcc -; CGP-NEXT: v_add_i32_e32 v1, vcc, v10, v5 -; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v11 -; CGP-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v10, v8, v0 -; CGP-NEXT: v_add_i32_e32 v1, vcc, v5, v1 -; CGP-NEXT: v_mul_hi_u32 v5, v7, v0 -; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v12 -; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v5 -; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; CGP-NEXT: v_mul_hi_u32 v0, v8, v0 -; CGP-NEXT: v_add_i32_e32 v1, vcc, v5, v1 -; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v5 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v5 -; CGP-NEXT: v_add_i32_e32 v10, vcc, v7, v1 -; CGP-NEXT: v_addc_u32_e32 v11, vcc, v8, v0, vcc -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v10, 0 +; CGP-NEXT: v_cndmask_b32_e64 v17, -1, v1, s[4:5] +; CGP-NEXT: v_cndmask_b32_e32 v20, -1, v0, vcc +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v8, v[5:6] +; CGP-NEXT: v_sub_i32_e32 v5, vcc, v18, v4 +; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], -1, v7, v[0:1] +; CGP-NEXT: v_subbrev_u32_e32 v21, vcc, 0, v19, vcc +; CGP-NEXT: v_mul_lo_u32 v1, v7, v13 +; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 +; CGP-NEXT: v_cndmask_b32_e32 v0, v18, v5, vcc +; CGP-NEXT: v_cndmask_b32_e32 v5, v19, v21, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17 -; CGP-NEXT: v_cndmask_b32_e32 v5, v15, v14, vcc -; CGP-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v6, v11, v[1:2] -; CGP-NEXT: v_cndmask_b32_e32 v12, v16, v13, vcc -; CGP-NEXT: v_xor_b32_e32 v13, v5, v9 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], -1, v10, v[7:8] -; CGP-NEXT: v_xor_b32_e32 v1, v12, v9 -; CGP-NEXT: v_ashrrev_i32_e32 v12, 31, v3 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v12 -; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v12, vcc -; CGP-NEXT: v_xor_b32_e32 v7, v2, v12 -; CGP-NEXT: v_mul_lo_u32 v2, v11, v0 -; CGP-NEXT: v_mul_lo_u32 v6, v10, v5 -; CGP-NEXT: v_xor_b32_e32 v8, v3, v12 -; CGP-NEXT: v_mul_hi_u32 v3, v10, v0 -; CGP-NEXT: v_mul_hi_u32 v0, v11, v0 +; CGP-NEXT: v_cndmask_b32_e32 v14, v15, v0, vcc +; CGP-NEXT: v_add_i32_e64 v0, s[4:5], v9, v1 +; CGP-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5] +; CGP-NEXT: v_add_i32_e64 v0, s[4:5], v0, v10 +; CGP-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; CGP-NEXT: v_mul_lo_u32 v9, v8, v13 +; CGP-NEXT: v_add_i32_e64 v0, s[4:5], v1, v0 +; CGP-NEXT: v_mul_hi_u32 v1, v7, v13 +; CGP-NEXT: v_add_i32_e64 v9, s[4:5], v9, v11 +; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5] +; CGP-NEXT: v_add_i32_e64 v1, s[4:5], v9, v1 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[4:5] +; CGP-NEXT: v_add_i32_e64 v9, s[4:5], v10, v9 +; CGP-NEXT: v_mul_hi_u32 v10, v8, v13 +; CGP-NEXT: v_add_i32_e64 v0, s[4:5], v1, v0 +; CGP-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5] +; CGP-NEXT: v_add_i32_e64 v1, s[4:5], v9, v1 +; CGP-NEXT: v_add_i32_e64 v1, s[4:5], v10, v1 +; CGP-NEXT: v_add_i32_e64 v9, s[4:5], v7, v0 +; CGP-NEXT: v_addc_u32_e64 v10, s[4:5], v8, v1, s[4:5] +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v9, 0 +; CGP-NEXT: v_cndmask_b32_e32 v5, v16, v5, vcc +; CGP-NEXT: v_ashrrev_i32_e32 v13, 31, v3 +; CGP-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v6, v10, v[1:2] +; CGP-NEXT: v_xor_b32_e32 v1, v5, v12 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v13 +; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], -1, v9, v[7:8] +; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v13, vcc +; CGP-NEXT: v_xor_b32_e32 v7, v2, v13 +; CGP-NEXT: v_mul_lo_u32 v2, v10, v0 +; CGP-NEXT: v_mul_lo_u32 v6, v9, 
v5
+; CGP-NEXT: v_xor_b32_e32 v8, v3, v13
+; CGP-NEXT: v_mul_hi_u32 v3, v9, v0
+; CGP-NEXT: v_mul_hi_u32 v0, v10, v0
 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v6
 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v3
 ; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; CGP-NEXT: v_mul_lo_u32 v3, v11, v5
+; CGP-NEXT: v_mul_lo_u32 v3, v10, v5
 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v6, v2
-; CGP-NEXT: v_mul_hi_u32 v6, v10, v5
+; CGP-NEXT: v_mul_hi_u32 v6, v9, v5
 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v3, v0
 ; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v6
 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
 ; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v6
-; CGP-NEXT: v_mul_hi_u32 v5, v11, v5
+; CGP-NEXT: v_mul_hi_u32 v5, v10, v5
 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v2
 ; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v3, v2
 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v5, v2
-; CGP-NEXT: v_add_i32_e32 v3, vcc, v10, v0
-; CGP-NEXT: v_addc_u32_e32 v2, vcc, v11, v2, vcc
+; CGP-NEXT: v_add_i32_e32 v3, vcc, v9, v0
+; CGP-NEXT: v_addc_u32_e32 v2, vcc, v10, v2, vcc
 ; CGP-NEXT: v_mul_lo_u32 v5, v8, v3
 ; CGP-NEXT: v_mul_lo_u32 v6, v7, v2
-; CGP-NEXT: v_sub_i32_e32 v0, vcc, v13, v9
-; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v9, vcc
+; CGP-NEXT: v_xor_b32_e32 v11, v14, v12
 ; CGP-NEXT: v_mul_hi_u32 v9, v7, v3
+; CGP-NEXT: v_sub_i32_e32 v0, vcc, v11, v12
+; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v12, vcc
 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v6
 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v9
@@ -2126,11 +2108,11 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) {
 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
 ; CGP-NEXT: v_add_i32_e32 v6, vcc, v9, v6
 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v3, v5
-; CGP-NEXT: v_mul_hi_u32 v10, v8, v2
+; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
+; CGP-NEXT: v_mul_hi_u32 v9, v8, v2
+; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v3
 ; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v5, 0
-; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v9
-; CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v5
+; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v6
 ; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v9, v[3:4]
 ; CGP-NEXT: v_sub_i32_e32 v2, vcc, v7, v2
 ; CGP-NEXT: v_subb_u32_e64 v3, s[4:5], v8, v5, vcc
@@ -2154,10 +2136,10 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) {
 ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6
 ; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
 ; CGP-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc
-; CGP-NEXT: v_xor_b32_e32 v2, v2, v12
-; CGP-NEXT: v_xor_b32_e32 v3, v3, v12
-; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v12
-; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v12, vcc
+; CGP-NEXT: v_xor_b32_e32 v2, v2, v13
+; CGP-NEXT: v_xor_b32_e32 v3, v3, v13
+; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v13
+; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v13, vcc
 ; CGP-NEXT: s_setpc_b64 s[30:31]
 %result = srem <2 x i64> %num, <i64 1235195, i64 1235195>
 ret <2 x i64> %result
}
@@ -2193,102 +2175,100 @@ define i64 @v_srem_i64_pow2_shl_denom(i64 %x, i64 %y) {
 ; CHECK-NEXT: v_xor_b32_e32 v1, v2, v1
 ; CHECK-NEXT: v_cvt_f32_u32_e32 v2, v0
 ; CHECK-NEXT: v_cvt_f32_u32_e32 v5, v1
-; CHECK-NEXT: v_sub_i32_e32 v11, vcc, 0, v0
-; CHECK-NEXT: v_subb_u32_e32 v12, vcc, 0, v1, vcc
+; CHECK-NEXT: v_sub_i32_e32 v12, vcc, 0, v0
+; CHECK-NEXT: v_subb_u32_e32 v13, vcc, 0, v1, vcc
 ; CHECK-NEXT: v_mac_f32_e32 v2, 0x4f800000, v5
 ; CHECK-NEXT: v_rcp_iflag_f32_e32 v2, v2
 ; CHECK-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2
 ; CHECK-NEXT: v_mul_f32_e32 v5, 
0x2f800000, v2 -; CHECK-NEXT: v_trunc_f32_e32 v7, v5 -; CHECK-NEXT: v_mac_f32_e32 v2, 0xcf800000, v7 -; CHECK-NEXT: v_cvt_u32_f32_e32 v10, v2 -; CHECK-NEXT: v_cvt_u32_f32_e32 v13, v7 -; CHECK-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v11, v10, 0 -; CHECK-NEXT: v_mov_b32_e32 v2, v6 -; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v11, v13, v[2:3] -; CHECK-NEXT: v_mul_lo_u32 v2, v13, v5 -; CHECK-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v10, v[6:7] -; CHECK-NEXT: v_mul_hi_u32 v6, v10, v5 -; CHECK-NEXT: v_mul_hi_u32 v5, v13, v5 -; CHECK-NEXT: v_mul_lo_u32 v7, v10, v8 -; CHECK-NEXT: v_mul_lo_u32 v9, v13, v8 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v7 -; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v6 -; CHECK-NEXT: v_mul_hi_u32 v6, v10, v8 -; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v7, v2 -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v9, v5 +; CHECK-NEXT: v_trunc_f32_e32 v5, v5 +; CHECK-NEXT: v_mac_f32_e32 v2, 0xcf800000, v5 +; CHECK-NEXT: v_cvt_u32_f32_e32 v2, v2 +; CHECK-NEXT: v_cvt_u32_f32_e32 v11, v5 +; CHECK-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v12, v2, 0 +; CHECK-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v12, v11, v[6:7] +; CHECK-NEXT: v_mul_lo_u32 v6, v11, v5 +; CHECK-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v2, v[7:8] +; CHECK-NEXT: v_mul_hi_u32 v7, v2, v5 +; CHECK-NEXT: v_mul_hi_u32 v5, v11, v5 +; CHECK-NEXT: v_mul_lo_u32 v8, v2, v9 +; CHECK-NEXT: v_mul_lo_u32 v10, v11, v9 +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v8 +; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v7 +; CHECK-NEXT: v_mul_hi_u32 v7, v2, v9 +; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v8, v6 +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v10, v5 +; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v7 ; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v7, vcc, v8, v7 +; CHECK-NEXT: v_mul_hi_u32 v8, v11, v9 ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v6 ; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; CHECK-NEXT: v_mul_hi_u32 v7, v13, v8 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v7, v5 -; CHECK-NEXT: v_add_i32_e32 v10, vcc, v10, v2 -; CHECK-NEXT: v_addc_u32_e32 v13, vcc, v13, v5, vcc -; CHECK-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v11, v10, 0 -; CHECK-NEXT: v_mov_b32_e32 v2, v6 -; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v11, v13, v[2:3] -; CHECK-NEXT: v_ashrrev_i32_e32 v11, 31, v4 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v11 -; CHECK-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v10, v[6:7] -; CHECK-NEXT: v_addc_u32_e32 v3, vcc, v4, v11, vcc -; CHECK-NEXT: v_xor_b32_e32 v9, v2, v11 -; CHECK-NEXT: v_mul_lo_u32 v2, v13, v5 -; CHECK-NEXT: v_mul_lo_u32 v4, v10, v8 -; CHECK-NEXT: v_xor_b32_e32 v12, v3, v11 -; CHECK-NEXT: v_mul_hi_u32 v3, v10, v5 -; CHECK-NEXT: v_mul_hi_u32 v5, v13, v5 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v4 -; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v3, v13, v8 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v4, v2 -; CHECK-NEXT: v_mul_hi_u32 v4, v10, v8 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v5 -; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v8, v6 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v5 +; 
CHECK-NEXT: v_addc_u32_e32 v11, vcc, v11, v6, vcc +; CHECK-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v12, v2, 0 +; CHECK-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v12, v11, v[6:7] +; CHECK-NEXT: v_ashrrev_i32_e32 v12, 31, v4 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v12 +; CHECK-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v2, v[7:8] +; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v4, v12, vcc +; CHECK-NEXT: v_xor_b32_e32 v8, v3, v12 +; CHECK-NEXT: v_mul_lo_u32 v3, v11, v5 +; CHECK-NEXT: v_mul_lo_u32 v6, v2, v9 +; CHECK-NEXT: v_xor_b32_e32 v10, v4, v12 +; CHECK-NEXT: v_mul_hi_u32 v4, v2, v5 +; CHECK-NEXT: v_mul_hi_u32 v5, v11, v5 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v6 +; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v4 -; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; CHECK-NEXT: v_mul_hi_u32 v5, v13, v8 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CHECK-NEXT: v_mul_lo_u32 v4, v11, v9 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v6, v3 +; CHECK-NEXT: v_mul_hi_u32 v6, v2, v9 +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v5 +; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v6 +; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v6 +; CHECK-NEXT: v_mul_hi_u32 v6, v11, v9 ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v5, v3 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v10, v2 -; CHECK-NEXT: v_addc_u32_e32 v3, vcc, v13, v3, vcc -; CHECK-NEXT: v_mul_lo_u32 v4, v12, v2 -; CHECK-NEXT: v_mul_lo_u32 v5, v9, v3 -; CHECK-NEXT: v_mul_hi_u32 v6, v9, v2 -; CHECK-NEXT: v_mul_hi_u32 v2, v12, v2 +; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v5, v4 +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v6, v4 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; CHECK-NEXT: v_addc_u32_e32 v3, vcc, v11, v4, vcc +; CHECK-NEXT: v_mul_lo_u32 v4, v10, v2 +; CHECK-NEXT: v_mul_lo_u32 v5, v8, v3 +; CHECK-NEXT: v_mul_hi_u32 v6, v8, v2 +; CHECK-NEXT: v_mul_hi_u32 v2, v10, v2 ; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v5 ; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v6 ; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v6, v12, v3 +; CHECK-NEXT: v_mul_lo_u32 v6, v10, v3 ; CHECK-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; CHECK-NEXT: v_mul_hi_u32 v5, v9, v3 +; CHECK-NEXT: v_mul_hi_u32 v5, v8, v3 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v6, v2 ; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v5 ; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; CHECK-NEXT: v_add_i32_e32 v8, vcc, v2, v4 -; CHECK-NEXT: v_mul_hi_u32 v6, v12, v3 -; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v0, v8, 0 -; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v4 +; CHECK-NEXT: v_add_i32_e32 v9, vcc, v2, v4 +; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v5, v2 +; CHECK-NEXT: v_mul_hi_u32 v5, v10, v3 +; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v0, v9, 0 +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v5, v4 ; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v0, v6, v[3:4] -; CHECK-NEXT: v_sub_i32_e32 v2, vcc, v9, v2 -; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v1, v8, v[4:5] -; CHECK-NEXT: v_subb_u32_e64 v3, s[4:5], v12, v6, vcc -; CHECK-NEXT: v_sub_i32_e64 v4, s[4:5], v12, v6 +; CHECK-NEXT: 
v_sub_i32_e32 v2, vcc, v8, v2 +; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v1, v9, v[4:5] +; CHECK-NEXT: v_subb_u32_e64 v3, s[4:5], v10, v6, vcc +; CHECK-NEXT: v_sub_i32_e64 v4, s[4:5], v10, v6 ; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v1 ; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[4:5] ; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v0 @@ -2313,10 +2293,10 @@ define i64 @v_srem_i64_pow2_shl_denom(i64 %x, i64 %y) { ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 ; CHECK-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; CHECK-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc -; CHECK-NEXT: v_xor_b32_e32 v0, v0, v11 -; CHECK-NEXT: v_xor_b32_e32 v1, v1, v11 -; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v11 -; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v11, vcc +; CHECK-NEXT: v_xor_b32_e32 v0, v0, v12 +; CHECK-NEXT: v_xor_b32_e32 v1, v1, v12 +; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v12 +; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v12, vcc ; CHECK-NEXT: ; implicit-def: $vgpr5_vgpr6 ; CHECK-NEXT: ; implicit-def: $vgpr3 ; CHECK-NEXT: s_andn2_saveexec_b64 s[4:5], s[6:7] @@ -2351,224 +2331,220 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; GISEL-LABEL: v_srem_v2i64_pow2_shl_denom: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_mov_b32_e32 v8, 0x1000 -; GISEL-NEXT: v_mov_b32_e32 v9, 0 -; GISEL-NEXT: v_lshl_b64 v[4:5], v[8:9], v4 +; GISEL-NEXT: v_mov_b32_e32 v10, 0x1000 +; GISEL-NEXT: v_mov_b32_e32 v11, 0 +; GISEL-NEXT: v_lshl_b64 v[4:5], v[10:11], v4 ; GISEL-NEXT: v_ashrrev_i32_e32 v7, 31, v5 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v7 -; GISEL-NEXT: v_addc_u32_e32 v10, vcc, v5, v7, vcc +; GISEL-NEXT: v_addc_u32_e32 v8, vcc, v5, v7, vcc ; GISEL-NEXT: v_xor_b32_e32 v5, v4, v7 -; GISEL-NEXT: v_xor_b32_e32 v7, v10, v7 +; GISEL-NEXT: v_xor_b32_e32 v7, v8, v7 ; GISEL-NEXT: v_cvt_f32_u32_e32 v4, v5 -; GISEL-NEXT: v_cvt_f32_u32_e32 v10, v7 -; GISEL-NEXT: v_sub_i32_e32 v16, vcc, 0, v5 -; GISEL-NEXT: v_subb_u32_e32 v17, vcc, 0, v7, vcc -; GISEL-NEXT: v_mac_f32_e32 v4, 0x4f800000, v10 +; GISEL-NEXT: v_cvt_f32_u32_e32 v8, v7 +; GISEL-NEXT: v_sub_i32_e32 v17, vcc, 0, v5 +; GISEL-NEXT: v_subb_u32_e32 v18, vcc, 0, v7, vcc +; GISEL-NEXT: v_mac_f32_e32 v4, 0x4f800000, v8 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v4, v4 ; GISEL-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 -; GISEL-NEXT: v_mul_f32_e32 v10, 0x2f800000, v4 -; GISEL-NEXT: v_trunc_f32_e32 v12, v10 -; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v12 -; GISEL-NEXT: v_cvt_u32_f32_e32 v15, v4 -; GISEL-NEXT: v_cvt_u32_f32_e32 v18, v12 -; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v16, v15, 0 -; GISEL-NEXT: v_mov_b32_e32 v4, v11 -; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v16, v18, v[4:5] -; GISEL-NEXT: v_mul_lo_u32 v4, v18, v10 -; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v17, v15, v[11:12] -; GISEL-NEXT: v_mul_hi_u32 v12, v15, v10 -; GISEL-NEXT: v_mul_hi_u32 v10, v18, v10 -; GISEL-NEXT: v_mul_lo_u32 v11, v15, v13 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v12, v18, v13 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v11, v4 -; GISEL-NEXT: v_mul_hi_u32 v11, v15, v13 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v12, v10 +; GISEL-NEXT: v_mul_f32_e32 v8, 0x2f800000, v4 +; GISEL-NEXT: v_trunc_f32_e32 v8, v8 +; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v8 +; GISEL-NEXT: v_cvt_u32_f32_e32 v4, v4 +; GISEL-NEXT: v_cvt_u32_f32_e32 v16, v8 +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], 
v17, v4, 0 +; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v17, v16, v[9:10] +; GISEL-NEXT: v_mul_lo_u32 v9, v16, v8 +; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v18, v4, v[12:13] +; GISEL-NEXT: v_mul_lo_u32 v12, v4, v14 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v12 +; GISEL-NEXT: v_mul_hi_u32 v12, v4, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; GISEL-NEXT: v_mul_hi_u32 v8, v16, v8 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v12, v16, v14 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v13, v9 +; GISEL-NEXT: v_mul_hi_u32 v13, v4, v14 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v12, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11 -; GISEL-NEXT: v_mul_hi_u32 v12, v18, v13 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v10, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v12, v10 -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v4 -; GISEL-NEXT: v_addc_u32_e32 v18, vcc, v18, v10, vcc -; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v16, v15, 0 -; GISEL-NEXT: v_mov_b32_e32 v4, v11 -; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v16, v18, v[4:5] +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v13 +; GISEL-NEXT: v_mul_hi_u32 v13, v16, v14 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v12, v9 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v13, v9 +; GISEL-NEXT: v_add_i32_e32 v19, vcc, v4, v8 +; GISEL-NEXT: v_addc_u32_e32 v16, vcc, v16, v9, vcc +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v17, v19, 0 ; GISEL-NEXT: v_ashrrev_i32_e32 v4, 31, v1 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v4 -; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v17, v15, v[11:12] +; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v17, v16, v[9:10] ; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v4, vcc -; GISEL-NEXT: v_xor_b32_e32 v12, v0, v4 -; GISEL-NEXT: v_mul_lo_u32 v0, v18, v10 -; GISEL-NEXT: v_mul_lo_u32 v11, v15, v13 -; GISEL-NEXT: v_xor_b32_e32 v16, v1, v4 -; GISEL-NEXT: v_mul_hi_u32 v1, v15, v10 -; GISEL-NEXT: v_mul_hi_u32 v10, v18, v10 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v18, v19, v[12:13] +; GISEL-NEXT: v_xor_b32_e32 v15, v0, v4 +; GISEL-NEXT: v_mul_lo_u32 v0, v16, v8 +; GISEL-NEXT: v_mul_lo_u32 v9, v19, v14 +; GISEL-NEXT: v_xor_b32_e32 v17, v1, v4 +; GISEL-NEXT: v_mul_hi_u32 v1, v19, v8 +; GISEL-NEXT: v_mul_hi_u32 v8, v16, v8 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v1, v18, v13 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v11, v0 -; GISEL-NEXT: v_mul_hi_u32 v11, v15, v13 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v11 -; GISEL-NEXT: v_mul_hi_u32 v11, v18, v13 +; GISEL-NEXT: v_mul_lo_u32 v1, v16, v14 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v9, v0 +; GISEL-NEXT: v_mul_hi_u32 v9, v19, v14 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v8 +; 
GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v9 +; GISEL-NEXT: v_mul_hi_u32 v9, v16, v14 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v11, v1 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v15, v0 -; GISEL-NEXT: v_addc_u32_e32 v11, vcc, v18, v1, vcc -; GISEL-NEXT: v_mul_lo_u32 v13, v16, v10 -; GISEL-NEXT: v_mul_lo_u32 v14, v12, v11 -; GISEL-NEXT: v_lshl_b64 v[0:1], v[8:9], v6 -; GISEL-NEXT: v_mul_hi_u32 v6, v12, v10 -; GISEL-NEXT: v_mul_hi_u32 v10, v16, v10 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v13, v14 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v8, v6 -; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v8, v16, v11 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v9, v6 -; GISEL-NEXT: v_mul_hi_u32 v9, v12, v11 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v8, v1 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v9, v1 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v19, v0 +; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v16, v1, vcc +; GISEL-NEXT: v_mul_lo_u32 v8, v17, v0 +; GISEL-NEXT: v_mul_lo_u32 v9, v15, v1 +; GISEL-NEXT: v_mul_hi_u32 v12, v15, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v17, v0 +; GISEL-NEXT: v_lshl_b64 v[10:11], v[10:11], v6 ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v8, v6 -; GISEL-NEXT: v_mul_hi_u32 v8, v16, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v9, v6 -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v8, v6 -; GISEL-NEXT: v_ashrrev_i32_e32 v8, 31, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v8 -; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v8, vcc -; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v5, v13, 0 -; GISEL-NEXT: v_xor_b32_e32 v6, v0, v8 -; GISEL-NEXT: v_xor_b32_e32 v8, v1, v8 -; GISEL-NEXT: v_cvt_f32_u32_e32 v1, v6 -; GISEL-NEXT: v_cvt_f32_u32_e32 v15, v8 -; GISEL-NEXT: v_mov_b32_e32 v0, v10 -; GISEL-NEXT: v_sub_i32_e32 v17, vcc, 0, v6 -; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v5, v14, v[0:1] -; GISEL-NEXT: v_mac_f32_e32 v1, 0x4f800000, v15 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v14, v1 -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v7, v13, v[10:11] -; GISEL-NEXT: v_subb_u32_e32 v18, vcc, 0, v8, vcc -; GISEL-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v14 -; GISEL-NEXT: v_mul_f32_e32 v10, 0x2f800000, v1 -; GISEL-NEXT: v_trunc_f32_e32 v13, v10 -; GISEL-NEXT: v_mac_f32_e32 v1, 0xcf800000, v13 -; GISEL-NEXT: v_cvt_u32_f32_e32 v15, v1 -; GISEL-NEXT: v_cvt_u32_f32_e32 v19, v13 -; GISEL-NEXT: v_sub_i32_e32 v9, vcc, v12, v9 -; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v17, v15, 0 -; GISEL-NEXT: v_subb_u32_e64 v20, s[4:5], v16, v0, vcc -; GISEL-NEXT: v_mov_b32_e32 v1, v11 -; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v17, v19, v[1:2] -; GISEL-NEXT: v_mul_lo_u32 v1, v19, v10 -; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v16, v0 -; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v18, v15, v[11:12] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v20, v7 -; GISEL-NEXT: v_subb_u32_e32 v0, vcc, v0, v7, vcc -; GISEL-NEXT: v_mul_lo_u32 v11, v15, v13 -; GISEL-NEXT: v_add_i32_e64 v1, s[4:5], v1, v11 -; GISEL-NEXT: v_mul_hi_u32 v11, v15, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, 
s[4:5] -; GISEL-NEXT: v_add_i32_e64 v1, s[4:5], v1, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[6:7] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v9, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[6:7] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], v20, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v11, v1, v11, s[6:7] -; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v9, v5 -; GISEL-NEXT: v_subbrev_u32_e64 v14, s[6:7], 0, v0, vcc -; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v1, v5 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v14, v7 -; GISEL-NEXT: v_subb_u32_e32 v0, vcc, v0, v7, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, -1, s[8:9] -; GISEL-NEXT: v_cndmask_b32_e64 v21, 0, -1, s[6:7] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], v14, v7 -; GISEL-NEXT: v_sub_i32_e32 v5, vcc, v1, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v16, v16, v21, s[6:7] -; GISEL-NEXT: v_subbrev_u32_e32 v0, vcc, 0, v0, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; GISEL-NEXT: v_cndmask_b32_e32 v5, v1, v5, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v7, v14, v0, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] -; GISEL-NEXT: v_mul_hi_u32 v1, v19, v10 -; GISEL-NEXT: v_mul_lo_u32 v10, v19, v13 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v12, v17, v1 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8 +; GISEL-NEXT: v_mul_hi_u32 v9, v15, v1 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v12, v0 -; GISEL-NEXT: v_mul_hi_u32 v12, v15, v13 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v12 -; GISEL-NEXT: v_mul_hi_u32 v12, v19, v13 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v1, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v12, v1 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v15, v0 -; GISEL-NEXT: v_addc_u32_e32 v14, vcc, v19, v1, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v17, v13, 0 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v12, v9 +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v0, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v0 +; GISEL-NEXT: v_mul_hi_u32 v9, v17, v1 +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v14, 0 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v9, v8 +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v5, v12, v[1:2] +; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v7, v14, v[8:9] +; GISEL-NEXT: v_sub_i32_e32 v13, vcc, v15, v0 +; GISEL-NEXT: v_subb_u32_e64 v14, s[4:5], v17, v12, vcc +; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v17, v12 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v14, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v13, v5 +; GISEL-NEXT: v_subb_u32_e32 v0, vcc, v0, v7, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v14, v7 +; GISEL-NEXT: v_sub_i32_e32 v15, vcc, v13, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v12, v1, v6, s[4:5] +; GISEL-NEXT: v_subbrev_u32_e64 v16, s[4:5], 0, v0, vcc +; GISEL-NEXT: v_ashrrev_i32_e32 v1, 31, v11 +; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v10, v1 +; GISEL-NEXT: v_addc_u32_e64 v8, s[4:5], v11, v1, s[4:5] +; GISEL-NEXT: v_xor_b32_e32 v6, v6, v1 +; GISEL-NEXT: v_xor_b32_e32 v8, v8, v1 +; GISEL-NEXT: v_cvt_f32_u32_e32 v1, v6 +; GISEL-NEXT: v_cvt_f32_u32_e32 v9, v8 +; GISEL-NEXT: v_cmp_ge_u32_e64 
s[4:5], v16, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v15, v5 +; GISEL-NEXT: v_mac_f32_e32 v1, 0x4f800000, v9 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v1, v1 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v16, v7 +; GISEL-NEXT: v_subb_u32_e32 v7, vcc, v0, v7, vcc +; GISEL-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v1 +; GISEL-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 +; GISEL-NEXT: v_trunc_f32_e32 v1, v1 +; GISEL-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 +; GISEL-NEXT: v_cvt_u32_f32_e32 v17, v0 +; GISEL-NEXT: v_sub_i32_e32 v19, vcc, 0, v6 +; GISEL-NEXT: v_cndmask_b32_e64 v11, v10, v11, s[4:5] +; GISEL-NEXT: v_cvt_u32_f32_e32 v18, v1 +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v19, v17, 0 +; GISEL-NEXT: v_subb_u32_e32 v20, vcc, 0, v8, vcc +; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v19, v18, v[1:2] +; GISEL-NEXT: v_mul_lo_u32 v21, v18, v0 +; GISEL-NEXT: v_mul_hi_u32 v22, v17, v0 +; GISEL-NEXT: v_mul_hi_u32 v23, v18, v0 +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v15, v5 +; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v7, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 -; GISEL-NEXT: v_cndmask_b32_e32 v5, v9, v5, vcc -; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v17, v14, v[1:2] -; GISEL-NEXT: v_xor_b32_e32 v1, v5, v4 -; GISEL-NEXT: v_ashrrev_i32_e32 v5, 31, v3 -; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v18, v13, v[9:10] -; GISEL-NEXT: v_cndmask_b32_e32 v7, v20, v7, vcc -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v5 -; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v5, vcc -; GISEL-NEXT: v_xor_b32_e32 v12, v2, v5 -; GISEL-NEXT: v_mul_lo_u32 v2, v14, v0 -; GISEL-NEXT: v_mul_lo_u32 v9, v13, v11 -; GISEL-NEXT: v_xor_b32_e32 v15, v3, v5 -; GISEL-NEXT: v_mul_hi_u32 v3, v13, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v14, v0 +; GISEL-NEXT: v_cndmask_b32_e32 v7, v15, v0, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v20, v17, v[9:10] +; GISEL-NEXT: v_cndmask_b32_e32 v5, v16, v5, vcc +; GISEL-NEXT: v_mul_lo_u32 v1, v17, v0 +; GISEL-NEXT: v_mul_lo_u32 v10, v18, v0 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v21, v1 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v22 +; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v9, v1 +; GISEL-NEXT: v_mul_hi_u32 v9, v17, v0 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v23 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; GISEL-NEXT: v_mul_hi_u32 v0, v18, v0 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v9, v1 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v9 +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v17, v1 +; GISEL-NEXT: v_addc_u32_e32 v16, vcc, v18, v0, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v19, v15, 0 +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; GISEL-NEXT: v_cndmask_b32_e32 v7, v13, v7, vcc +; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v19, v16, v[1:2] +; GISEL-NEXT: v_xor_b32_e32 v1, v7, v4 +; GISEL-NEXT: v_ashrrev_i32_e32 v7, 31, v3 +; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v20, v15, v[9:10] +; GISEL-NEXT: v_cndmask_b32_e32 v5, v14, v5, vcc +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v7 +; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v7, vcc +; GISEL-NEXT: v_xor_b32_e32 v12, v2, v7 +; GISEL-NEXT: v_mul_lo_u32 v2, v16, v0 +; GISEL-NEXT: v_mul_lo_u32 v9, v15, v11 +; GISEL-NEXT: v_xor_b32_e32 v13, v3, v7 
+; GISEL-NEXT: v_mul_hi_u32 v3, v15, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v16, v0 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v3, v14, v11 +; GISEL-NEXT: v_mul_lo_u32 v3, v16, v11 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v9, v2 -; GISEL-NEXT: v_mul_hi_u32 v9, v13, v11 +; GISEL-NEXT: v_mul_hi_u32 v9, v15, v11 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v9 -; GISEL-NEXT: v_mul_hi_u32 v9, v14, v11 +; GISEL-NEXT: v_mul_hi_u32 v9, v16, v11 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v9, v2 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v13, v0 -; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v14, v2, vcc -; GISEL-NEXT: v_mul_lo_u32 v3, v15, v0 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v15, v0 +; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v16, v2, vcc +; GISEL-NEXT: v_mul_lo_u32 v3, v13, v0 ; GISEL-NEXT: v_mul_lo_u32 v9, v12, v2 ; GISEL-NEXT: v_mul_hi_u32 v10, v12, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v15, v0 -; GISEL-NEXT: v_xor_b32_e32 v7, v7, v4 +; GISEL-NEXT: v_mul_hi_u32 v0, v13, v0 +; GISEL-NEXT: v_xor_b32_e32 v5, v5, v4 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v10, v15, v2 +; GISEL-NEXT: v_mul_lo_u32 v10, v13, v2 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v9, v3 ; GISEL-NEXT: v_mul_hi_u32 v9, v12, v2 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v0 @@ -2577,26 +2553,25 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9 ; GISEL-NEXT: v_add_i32_e32 v11, vcc, v0, v3 -; GISEL-NEXT: v_mul_hi_u32 v10, v15, v2 -; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v11, 0 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v9, v0 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v10, v0 -; GISEL-NEXT: v_mov_b32_e32 v0, v3 -; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v6, v13, v[0:1] +; GISEL-NEXT: v_mul_hi_u32 v9, v13, v2 +; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v11, 0 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v9, v0 +; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v6, v0, v[3:4] ; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v1, v4 -; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v7, v4, vcc +; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v5, v4, vcc ; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v8, v11, v[9:10] ; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v12, v2 -; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], v15, v3, vcc -; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v15, v3 +; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], v13, v3, vcc +; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v13, v3 ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[4:5] ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] ; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, v8 ; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v8, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v7, v7, v9, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v5, v5, v9, s[4:5] ; GISEL-NEXT: 
v_sub_i32_e32 v9, vcc, v2, v6 ; GISEL-NEXT: v_subbrev_u32_e64 v10, s[4:5], 0, v3, vcc ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v10, v8 @@ -2611,13 +2586,13 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 ; GISEL-NEXT: v_cndmask_b32_e32 v6, v9, v6, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v3, v10, v3, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 ; GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc -; GISEL-NEXT: v_xor_b32_e32 v2, v2, v5 -; GISEL-NEXT: v_xor_b32_e32 v3, v3, v5 -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v5 -; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v5, vcc +; GISEL-NEXT: v_xor_b32_e32 v2, v2, v7 +; GISEL-NEXT: v_xor_b32_e32 v3, v3, v7 +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v7 +; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v7, vcc ; GISEL-NEXT: s_setpc_b64 s[30:31] ; ; CGP-LABEL: v_srem_v2i64_pow2_shl_denom: @@ -2645,103 +2620,100 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; CGP-NEXT: v_xor_b32_e32 v1, v4, v1 ; CGP-NEXT: v_cvt_f32_u32_e32 v4, v0 ; CGP-NEXT: v_cvt_f32_u32_e32 v10, v1 -; CGP-NEXT: v_sub_i32_e32 v16, vcc, 0, v0 -; CGP-NEXT: v_subb_u32_e32 v17, vcc, 0, v1, vcc +; CGP-NEXT: v_sub_i32_e32 v17, vcc, 0, v0 +; CGP-NEXT: v_subb_u32_e32 v18, vcc, 0, v1, vcc ; CGP-NEXT: v_mac_f32_e32 v4, 0x4f800000, v10 ; CGP-NEXT: v_rcp_iflag_f32_e32 v4, v4 ; CGP-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 ; CGP-NEXT: v_mul_f32_e32 v10, 0x2f800000, v4 -; CGP-NEXT: v_trunc_f32_e32 v12, v10 -; CGP-NEXT: v_mac_f32_e32 v4, 0xcf800000, v12 -; CGP-NEXT: v_cvt_u32_f32_e32 v15, v4 -; CGP-NEXT: v_cvt_u32_f32_e32 v18, v12 -; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v16, v15, 0 -; CGP-NEXT: v_mov_b32_e32 v4, v11 -; CGP-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v16, v18, v[4:5] -; CGP-NEXT: v_mul_lo_u32 v4, v18, v10 -; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v17, v15, v[11:12] -; CGP-NEXT: v_mul_hi_u32 v11, v15, v10 -; CGP-NEXT: v_mul_hi_u32 v10, v18, v10 -; CGP-NEXT: v_mul_lo_u32 v12, v15, v13 -; CGP-NEXT: v_mul_lo_u32 v14, v18, v13 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v12 -; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v11 -; CGP-NEXT: v_mul_hi_u32 v11, v15, v13 -; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v12, v4 -; CGP-NEXT: v_add_i32_e32 v10, vcc, v14, v10 +; CGP-NEXT: v_trunc_f32_e32 v10, v10 +; CGP-NEXT: v_mac_f32_e32 v4, 0xcf800000, v10 +; CGP-NEXT: v_cvt_u32_f32_e32 v4, v4 +; CGP-NEXT: v_cvt_u32_f32_e32 v16, v10 +; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v17, v4, 0 +; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v17, v16, v[11:12] +; CGP-NEXT: v_mul_lo_u32 v11, v16, v10 +; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v18, v4, v[12:13] +; CGP-NEXT: v_mul_hi_u32 v12, v4, v10 +; CGP-NEXT: v_mul_hi_u32 v10, v16, v10 +; CGP-NEXT: v_mul_lo_u32 v13, v4, v14 +; CGP-NEXT: v_mul_lo_u32 v15, v16, v14 +; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v13 +; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v12 +; CGP-NEXT: v_mul_hi_u32 v12, v4, v14 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v11, vcc, v13, v11 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v15, v10 +; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v12 ; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v12, vcc, v13, v12 +; CGP-NEXT: v_mul_hi_u32 v13, v16, v14 ; CGP-NEXT: 
v_add_i32_e32 v10, vcc, v10, v11 ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v11, vcc, v12, v11 -; CGP-NEXT: v_mul_hi_u32 v12, v18, v13 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v10, v4 -; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; CGP-NEXT: v_add_i32_e32 v10, vcc, v12, v10 -; CGP-NEXT: v_add_i32_e32 v15, vcc, v15, v4 -; CGP-NEXT: v_addc_u32_e32 v18, vcc, v18, v10, vcc -; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v16, v15, 0 -; CGP-NEXT: v_mov_b32_e32 v4, v11 -; CGP-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v16, v18, v[4:5] -; CGP-NEXT: v_ashrrev_i32_e32 v16, 31, v9 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v8, v16 -; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v17, v15, v[11:12] -; CGP-NEXT: v_addc_u32_e32 v8, vcc, v9, v16, vcc -; CGP-NEXT: v_xor_b32_e32 v14, v4, v16 -; CGP-NEXT: v_mul_lo_u32 v4, v18, v10 -; CGP-NEXT: v_mul_lo_u32 v9, v15, v13 -; CGP-NEXT: v_xor_b32_e32 v17, v8, v16 -; CGP-NEXT: v_mul_hi_u32 v8, v15, v10 -; CGP-NEXT: v_mul_hi_u32 v10, v18, v10 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9 -; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v8 -; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v8, v18, v13 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v9, v4 -; CGP-NEXT: v_mul_hi_u32 v9, v15, v13 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v10 -; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v11, vcc, v13, v11 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v10 +; CGP-NEXT: v_addc_u32_e32 v16, vcc, v16, v11, vcc +; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v17, v4, 0 +; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v17, v16, v[11:12] +; CGP-NEXT: v_ashrrev_i32_e32 v17, 31, v9 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v17 +; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v18, v4, v[12:13] +; CGP-NEXT: v_addc_u32_e32 v9, vcc, v9, v17, vcc +; CGP-NEXT: v_xor_b32_e32 v15, v8, v17 +; CGP-NEXT: v_mul_lo_u32 v8, v16, v10 +; CGP-NEXT: v_mul_lo_u32 v11, v4, v14 +; CGP-NEXT: v_xor_b32_e32 v18, v9, v17 +; CGP-NEXT: v_mul_hi_u32 v9, v4, v10 +; CGP-NEXT: v_mul_hi_u32 v10, v16, v10 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v11 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v9 -; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v9 -; CGP-NEXT: v_mul_hi_u32 v10, v18, v13 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v8, v4 ; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CGP-NEXT: v_mul_lo_u32 v9, v16, v14 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v11, v8 +; CGP-NEXT: v_mul_hi_u32 v11, v4, v14 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v10 +; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v11 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v11 +; CGP-NEXT: v_mul_hi_u32 v11, v16, v14 ; CGP-NEXT: v_add_i32_e32 v8, vcc, v9, v8 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v10, v8 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v15, v4 -; CGP-NEXT: v_addc_u32_e32 v8, vcc, v18, v8, vcc -; CGP-NEXT: v_mul_lo_u32 v9, v17, v4 -; CGP-NEXT: v_mul_lo_u32 v10, v14, v8 -; CGP-NEXT: v_mul_hi_u32 v11, v14, v4 -; CGP-NEXT: v_mul_hi_u32 v4, v17, v4 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v9 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v11, v9 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v8 +; CGP-NEXT: v_addc_u32_e32 v8, vcc, v16, v9, vcc +; CGP-NEXT: v_mul_lo_u32 v9, v18, v4 +; CGP-NEXT: v_mul_lo_u32 v10, v15, v8 +; CGP-NEXT: v_mul_hi_u32 v11, v15, v4 +; CGP-NEXT: 
v_mul_hi_u32 v4, v18, v4 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v10 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v11 ; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v11, v17, v8 +; CGP-NEXT: v_mul_lo_u32 v11, v18, v8 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v9 -; CGP-NEXT: v_mul_hi_u32 v10, v14, v8 +; CGP-NEXT: v_mul_hi_u32 v10, v15, v8 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v11, v4 ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v10 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; CGP-NEXT: v_add_i32_e32 v13, vcc, v4, v9 -; CGP-NEXT: v_mul_hi_u32 v11, v17, v8 -; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v0, v13, 0 -; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v10, v4 -; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v4 -; CGP-NEXT: v_mov_b32_e32 v4, v9 -; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v0, v11, v[4:5] -; CGP-NEXT: v_sub_i32_e32 v4, vcc, v14, v8 -; CGP-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v1, v13, v[9:10] -; CGP-NEXT: v_subb_u32_e64 v8, s[4:5], v17, v11, vcc -; CGP-NEXT: v_sub_i32_e64 v9, s[4:5], v17, v11 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CGP-NEXT: v_mul_hi_u32 v11, v18, v8 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v9 +; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v0, v4, 0 +; CGP-NEXT: v_add_i32_e32 v12, vcc, v11, v10 +; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v0, v12, v[9:10] +; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v1, v4, v[10:11] +; CGP-NEXT: v_sub_i32_e32 v4, vcc, v15, v8 +; CGP-NEXT: v_subb_u32_e64 v8, s[4:5], v18, v12, vcc +; CGP-NEXT: v_sub_i32_e64 v9, s[4:5], v18, v12 ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v1 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v0 @@ -2766,10 +2738,10 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 ; CGP-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc ; CGP-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc -; CGP-NEXT: v_xor_b32_e32 v0, v0, v16 -; CGP-NEXT: v_xor_b32_e32 v1, v1, v16 -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v16 -; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v16, vcc +; CGP-NEXT: v_xor_b32_e32 v0, v0, v17 +; CGP-NEXT: v_xor_b32_e32 v1, v1, v17 +; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v17 +; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v17, vcc ; CGP-NEXT: ; implicit-def: $vgpr11_vgpr12 ; CGP-NEXT: ; implicit-def: $vgpr8 ; CGP-NEXT: .LBB8_2: ; %Flow1 @@ -2827,76 +2799,74 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; CGP-NEXT: v_mul_f32_e32 v6, 0x2f800000, v4 ; CGP-NEXT: v_trunc_f32_e32 v6, v6 ; CGP-NEXT: v_mac_f32_e32 v4, 0xcf800000, v6 -; CGP-NEXT: v_cvt_u32_f32_e32 v13, v4 +; CGP-NEXT: v_cvt_u32_f32_e32 v4, v4 ; CGP-NEXT: v_cvt_u32_f32_e32 v6, v6 -; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v14, v13, 0 -; CGP-NEXT: v_mov_b32_e32 v4, v9 -; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v14, v6, v[4:5] -; CGP-NEXT: v_mul_lo_u32 v4, v6, v8 -; CGP-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v15, v13, v[9:10] -; CGP-NEXT: v_mul_hi_u32 v9, v13, v8 +; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v14, v4, 0 +; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v6, v[9:10] +; CGP-NEXT: v_mul_lo_u32 v9, v6, v8 +; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v15, v4, v[10:11] +; CGP-NEXT: v_mul_hi_u32 v10, v4, v8 ; CGP-NEXT: v_mul_hi_u32 v8, v6, v8 -; CGP-NEXT: v_mul_lo_u32 v10, v13, v11 -; CGP-NEXT: v_mul_lo_u32 v12, v6, v11 -; 
CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v10 -; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9 -; CGP-NEXT: v_mul_hi_u32 v9, v13, v11 -; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v10, v4 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v12, v8 +; CGP-NEXT: v_mul_lo_u32 v11, v4, v12 +; CGP-NEXT: v_mul_lo_u32 v13, v6, v12 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v11 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v10 +; CGP-NEXT: v_mul_hi_u32 v10, v4, v12 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v9, vcc, v11, v9 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v13, v8 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v10 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; CGP-NEXT: v_mul_hi_u32 v11, v6, v12 ; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v9 ; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v9 -; CGP-NEXT: v_mul_hi_u32 v10, v6, v11 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v8, v4 -; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v8, vcc, v9, v8 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v10, v8 -; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v4 -; CGP-NEXT: v_addc_u32_e32 v6, vcc, v6, v8, vcc -; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v14, v13, 0 -; CGP-NEXT: v_mov_b32_e32 v4, v9 -; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v14, v6, v[4:5] +; CGP-NEXT: v_add_i32_e32 v9, vcc, v11, v9 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v8 +; CGP-NEXT: v_addc_u32_e32 v6, vcc, v6, v9, vcc +; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v14, v4, 0 +; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v6, v[9:10] ; CGP-NEXT: v_ashrrev_i32_e32 v14, 31, v7 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v5, v14 -; CGP-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v15, v13, v[9:10] -; CGP-NEXT: v_addc_u32_e32 v5, vcc, v7, v14, vcc -; CGP-NEXT: v_xor_b32_e32 v10, v4, v14 -; CGP-NEXT: v_mul_lo_u32 v4, v6, v8 -; CGP-NEXT: v_mul_lo_u32 v7, v13, v11 -; CGP-NEXT: v_xor_b32_e32 v12, v5, v14 -; CGP-NEXT: v_mul_hi_u32 v5, v13, v8 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v14 +; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v15, v4, v[10:11] +; CGP-NEXT: v_addc_u32_e32 v7, vcc, v7, v14, vcc +; CGP-NEXT: v_xor_b32_e32 v10, v5, v14 +; CGP-NEXT: v_mul_lo_u32 v5, v6, v8 +; CGP-NEXT: v_mul_lo_u32 v9, v4, v12 +; CGP-NEXT: v_xor_b32_e32 v11, v7, v14 +; CGP-NEXT: v_mul_hi_u32 v7, v4, v8 ; CGP-NEXT: v_mul_hi_u32 v8, v6, v8 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v7 -; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5 -; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v5, v6, v11 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v7, v4 -; CGP-NEXT: v_mul_hi_u32 v7, v13, v11 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v8 -; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v9 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v7 -; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v7 -; CGP-NEXT: v_mul_hi_u32 v8, v6, v11 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v5, v4 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CGP-NEXT: v_mul_lo_u32 v7, v6, v12 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v9, v5 +; CGP-NEXT: v_mul_hi_u32 v9, v4, v12 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v8 +; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v9 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; 
CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v9 +; CGP-NEXT: v_mul_hi_u32 v9, v6, v12 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v8, v5 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v13, v4 -; CGP-NEXT: v_addc_u32_e32 v5, vcc, v6, v5, vcc -; CGP-NEXT: v_mul_lo_u32 v6, v12, v4 +; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v7 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v9, v7 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5 +; CGP-NEXT: v_addc_u32_e32 v5, vcc, v6, v7, vcc +; CGP-NEXT: v_mul_lo_u32 v6, v11, v4 ; CGP-NEXT: v_mul_lo_u32 v7, v10, v5 ; CGP-NEXT: v_mul_hi_u32 v8, v10, v4 -; CGP-NEXT: v_mul_hi_u32 v4, v12, v4 +; CGP-NEXT: v_mul_hi_u32 v4, v11, v4 ; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v7 ; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v8 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v8, v12, v5 +; CGP-NEXT: v_mul_lo_u32 v8, v11, v5 ; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6 ; CGP-NEXT: v_mul_hi_u32 v7, v10, v5 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v8, v4 @@ -2904,17 +2874,17 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v7 ; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v7 -; CGP-NEXT: v_add_i32_e32 v11, vcc, v4, v6 -; CGP-NEXT: v_mul_hi_u32 v8, v12, v5 -; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v11, 0 -; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v6 +; CGP-NEXT: v_add_i32_e32 v12, vcc, v4, v6 +; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v4 +; CGP-NEXT: v_mul_hi_u32 v7, v11, v5 +; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v12, 0 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v7, v6 ; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v2, v8, v[5:6] ; CGP-NEXT: v_sub_i32_e32 v4, vcc, v10, v4 -; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v3, v11, v[6:7] -; CGP-NEXT: v_subb_u32_e64 v5, s[4:5], v12, v8, vcc -; CGP-NEXT: v_sub_i32_e64 v6, s[4:5], v12, v8 +; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v3, v12, v[6:7] +; CGP-NEXT: v_subb_u32_e64 v5, s[4:5], v11, v8, vcc +; CGP-NEXT: v_sub_i32_e64 v6, s[4:5], v11, v8 ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v3 ; CGP-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v2 @@ -3036,29 +3006,30 @@ define <2 x i64> @v_srem_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) { ; GISEL-NEXT: v_and_b32_e32 v1, 0xffffff, v4 ; GISEL-NEXT: v_cvt_f32_u32_e32 v3, v1 ; GISEL-NEXT: v_cvt_f32_ubyte0_e32 v9, 0 -; GISEL-NEXT: v_sub_i32_e32 v11, vcc, 0, v1 +; GISEL-NEXT: v_sub_i32_e32 v12, vcc, 0, v1 ; GISEL-NEXT: v_mac_f32_e32 v3, 0x4f800000, v9 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v3, v3 -; GISEL-NEXT: v_subb_u32_e64 v12, s[4:5], 0, 0, vcc -; GISEL-NEXT: v_and_b32_e32 v2, 0xffffff, v2 +; GISEL-NEXT: v_subb_u32_e64 v13, s[4:5], 0, 0, vcc +; GISEL-NEXT: v_and_b32_e32 v0, 0xffffff, v0 ; GISEL-NEXT: v_mul_f32_e32 v3, 0x5f7ffffc, v3 ; GISEL-NEXT: v_mul_f32_e32 v4, 0x2f800000, v3 -; GISEL-NEXT: v_trunc_f32_e32 v5, v4 -; GISEL-NEXT: v_mac_f32_e32 v3, 0xcf800000, v5 -; GISEL-NEXT: v_cvt_u32_f32_e32 v10, v3 -; GISEL-NEXT: v_cvt_u32_f32_e32 v13, v5 -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v11, v10, 0 -; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v11, v13, v[4:5] -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v12, v10, v[7:8] -; GISEL-NEXT: v_mul_lo_u32 v5, v13, v3 -; GISEL-NEXT: v_mul_hi_u32 v7, v10, v3 -; GISEL-NEXT: v_mul_lo_u32 v8, v10, v4 -; 
GISEL-NEXT: v_mul_hi_u32 v3, v13, v3 -; GISEL-NEXT: v_mul_lo_u32 v14, v13, v4 +; GISEL-NEXT: v_trunc_f32_e32 v4, v4 +; GISEL-NEXT: v_mac_f32_e32 v3, 0xcf800000, v4 +; GISEL-NEXT: v_cvt_u32_f32_e32 v11, v3 +; GISEL-NEXT: v_cvt_u32_f32_e32 v10, v4 +; GISEL-NEXT: v_and_b32_e32 v2, 0xffffff, v2 +; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v12, v11, 0 +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v12, v10, v[4:5] +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v13, v11, v[7:8] +; GISEL-NEXT: v_mul_lo_u32 v5, v10, v3 +; GISEL-NEXT: v_mul_hi_u32 v7, v11, v3 +; GISEL-NEXT: v_mul_lo_u32 v8, v11, v4 +; GISEL-NEXT: v_mul_hi_u32 v3, v10, v3 +; GISEL-NEXT: v_mul_lo_u32 v14, v10, v4 ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7 -; GISEL-NEXT: v_mul_hi_u32 v7, v10, v4 +; GISEL-NEXT: v_mul_hi_u32 v7, v11, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v8, v5 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v14, v3 @@ -3066,165 +3037,160 @@ define <2 x i64> @v_srem_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) { ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v8, v7 -; GISEL-NEXT: v_mul_hi_u32 v4, v13, v4 +; GISEL-NEXT: v_mul_hi_u32 v4, v10, v4 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v5 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v3 -; GISEL-NEXT: v_addc_u32_e32 v13, vcc, v13, v4, vcc -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v11, v10, 0 -; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v11, v13, v[4:5] -; GISEL-NEXT: v_and_b32_e32 v11, 0xffffff, v0 -; GISEL-NEXT: v_mul_lo_u32 v0, v13, v3 -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v12, v10, v[7:8] -; GISEL-NEXT: v_mul_hi_u32 v7, v10, v3 -; GISEL-NEXT: v_mul_hi_u32 v3, v13, v3 -; GISEL-NEXT: v_mul_lo_u32 v5, v10, v4 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v5 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v3 +; GISEL-NEXT: v_addc_u32_e32 v10, vcc, v10, v4, vcc +; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v12, v11, 0 +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v12, v10, v[4:5] +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v13, v11, v[7:8] +; GISEL-NEXT: v_mul_lo_u32 v5, v10, v3 +; GISEL-NEXT: v_mul_hi_u32 v8, v11, v3 +; GISEL-NEXT: v_mul_lo_u32 v7, v11, v4 +; GISEL-NEXT: v_mul_hi_u32 v3, v10, v3 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v7, v13, v4 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0 -; GISEL-NEXT: v_mul_hi_u32 v5, v10, v4 -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v7, v3 +; GISEL-NEXT: v_mul_lo_u32 v8, v10, v4 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5 +; GISEL-NEXT: v_mul_hi_u32 v7, v11, v4 +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v8, v3 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v8, v7 +; GISEL-NEXT: v_mul_hi_u32 v4, v10, v4 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5 -; GISEL-NEXT: v_mul_hi_u32 v4, v13, v4 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0 -; 
GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v5, v3 -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v0 -; GISEL-NEXT: v_addc_u32_e32 v7, vcc, v13, v3, vcc -; GISEL-NEXT: v_mul_lo_u32 v4, 0, v0 -; GISEL-NEXT: v_mul_lo_u32 v5, v11, v7 -; GISEL-NEXT: v_and_b32_e32 v3, 0xffffff, v6 -; GISEL-NEXT: v_mul_hi_u32 v6, v11, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, 0, v0 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v5 -; GISEL-NEXT: v_mul_lo_u32 v5, 0, v7 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v6 -; GISEL-NEXT: v_mul_hi_u32 v6, v11, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v6 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v0, v4 -; GISEL-NEXT: v_cvt_f32_u32_e32 v0, v3 -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, v12, 0 -; GISEL-NEXT: v_mul_hi_u32 v7, 0, v7 -; GISEL-NEXT: v_mac_f32_e32 v0, 0x4f800000, v9 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v8, v0 -; GISEL-NEXT: v_mov_b32_e32 v0, v5 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v1, v7, v[0:1] -; GISEL-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v8 -; GISEL-NEXT: v_mul_f32_e32 v7, 0x2f800000, v0 -; GISEL-NEXT: v_trunc_f32_e32 v9, v7 -; GISEL-NEXT: v_mac_f32_e32 v0, 0xcf800000, v9 -; GISEL-NEXT: v_cvt_u32_f32_e32 v13, v0 -; GISEL-NEXT: v_sub_i32_e32 v14, vcc, 0, v3 -; GISEL-NEXT: v_cvt_u32_f32_e32 v16, v9 -; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v14, v13, 0 -; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], 0, v12, v[5:6] -; GISEL-NEXT: v_mov_b32_e32 v0, v8 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v14, v16, v[0:1] -; GISEL-NEXT: v_subb_u32_e64 v15, s[4:5], 0, 0, vcc -; GISEL-NEXT: v_sub_i32_e32 v12, vcc, v11, v4 -; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v15, v13, v[5:6] -; GISEL-NEXT: v_mul_lo_u32 v4, v16, v7 -; GISEL-NEXT: v_mul_hi_u32 v8, v13, v7 -; GISEL-NEXT: v_mul_lo_u32 v5, v13, v10 -; GISEL-NEXT: v_subb_u32_e64 v17, s[4:5], 0, v9, vcc -; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], 0, v9 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v12, v1 -; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v4, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v4, v8 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v11, v3 +; GISEL-NEXT: v_addc_u32_e32 v4, vcc, v10, v4, vcc +; GISEL-NEXT: v_mul_lo_u32 v7, 0, v5 +; GISEL-NEXT: v_mul_lo_u32 v8, v0, v4 +; GISEL-NEXT: v_and_b32_e32 v3, 0xffffff, v6 +; GISEL-NEXT: v_mul_hi_u32 v6, v0, v5 +; GISEL-NEXT: v_mul_hi_u32 v5, 0, v5 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v8 +; GISEL-NEXT: v_mul_lo_u32 v8, 0, v4 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; GISEL-NEXT: v_mul_hi_u32 v7, v0, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v8, v5 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v5, v6 +; GISEL-NEXT: v_cvt_f32_u32_e32 v11, v3 +; GISEL-NEXT: v_mul_hi_u32 v8, 0, v4 +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, v10, 0 +; GISEL-NEXT: v_mac_f32_e32 v11, 0x4f800000, v9 +; GISEL-NEXT: v_sub_i32_e32 v13, vcc, 0, v3 +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v1, v8, v[5:6] +; GISEL-NEXT: v_rcp_iflag_f32_e32 v5, v11 +; GISEL-NEXT: v_subb_u32_e64 v14, s[4:5], 0, 0, vcc +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], 0, v10, v[6:7] +; GISEL-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5 +; GISEL-NEXT: v_mul_f32_e32 v6, 0x2f800000, v5 +; GISEL-NEXT: v_trunc_f32_e32 v6, v6 +; GISEL-NEXT: v_mac_f32_e32 v5, 0xcf800000, v6 +; GISEL-NEXT: 
v_cvt_u32_f32_e32 v11, v5 +; GISEL-NEXT: v_cvt_u32_f32_e32 v12, v6 +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v4 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v13, v11, 0 +; GISEL-NEXT: v_subb_u32_e64 v15, s[4:5], 0, v8, vcc +; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v12, v[6:7] +; GISEL-NEXT: v_sub_i32_e64 v8, s[4:5], 0, v8 +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v14, v11, v[9:10] +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v1 +; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[4:5] +; GISEL-NEXT: v_mul_lo_u32 v7, v12, v5 +; GISEL-NEXT: v_mul_lo_u32 v9, v11, v6 +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v15 +; GISEL-NEXT: v_cndmask_b32_e64 v10, -1, v4, s[4:5] +; GISEL-NEXT: v_mul_hi_u32 v4, v11, v5 +; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v7, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v7, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[4:5] -; GISEL-NEXT: v_mul_lo_u32 v8, v16, v10 -; GISEL-NEXT: v_mul_hi_u32 v7, v16, v7 -; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v5, v4 -; GISEL-NEXT: v_mul_hi_u32 v5, v13, v10 -; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v8, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[4:5] +; GISEL-NEXT: v_mul_lo_u32 v7, v12, v6 +; GISEL-NEXT: v_mul_hi_u32 v5, v12, v5 +; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v9, v4 +; GISEL-NEXT: v_mul_hi_u32 v9, v11, v6 ; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v7, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v8, v7 -; GISEL-NEXT: v_mul_hi_u32 v8, v16, v10 +; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v5, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v7, v9 +; GISEL-NEXT: v_mul_hi_u32 v6, v12, v6 ; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v5, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[4:5] ; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v7, v5 -; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v8, v5 -; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v13, v4 -; GISEL-NEXT: v_addc_u32_e64 v10, s[4:5], v16, v5, s[4:5] -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v14, v9, 0 -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v17 -; GISEL-NEXT: v_subbrev_u32_e32 v7, vcc, 0, v0, vcc -; GISEL-NEXT: v_mov_b32_e32 v0, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v11, -1, v6, s[4:5] -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v14, v10, v[0:1] -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v12, v1 -; GISEL-NEXT: v_subbrev_u32_e32 v13, vcc, 0, v7, vcc -; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v15, v9, v[5:6] -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v1 +; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v6, v5 +; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v11, v4 +; GISEL-NEXT: v_addc_u32_e64 v12, s[4:5], v12, v5, s[4:5] +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v13, v11, 0 +; GISEL-NEXT: v_subbrev_u32_e32 v8, vcc, 0, v8, vcc +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v13, v12, v[5:6] +; GISEL-NEXT: v_sub_i32_e32 v13, vcc, v0, v1 +; GISEL-NEXT: v_subbrev_u32_e32 v16, vcc, 0, v8, vcc +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v14, v11, v[6:7] +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v13, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc -; GISEL-NEXT: v_mul_lo_u32 v6, v10, v4 -; GISEL-NEXT: v_mul_lo_u32 v8, v9, v7 -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v13 -; GISEL-NEXT: v_cndmask_b32_e32 v14, -1, v5, vcc -; GISEL-NEXT: v_mul_hi_u32 v5, v9, v4 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v6, v12, v4 +; GISEL-NEXT: v_mul_lo_u32 v7, v11, v8 +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v16 +; GISEL-NEXT: 
v_cndmask_b32_e32 v9, -1, v5, vcc +; GISEL-NEXT: v_mul_hi_u32 v5, v11, v4 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v6, v10, v7 -; GISEL-NEXT: v_mul_hi_u32 v4, v10, v4 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v8, v5 -; GISEL-NEXT: v_mul_hi_u32 v8, v9, v7 +; GISEL-NEXT: v_mul_lo_u32 v6, v12, v8 +; GISEL-NEXT: v_mul_hi_u32 v4, v12, v4 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5 +; GISEL-NEXT: v_mul_hi_u32 v7, v11, v8 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v6, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v8 -; GISEL-NEXT: v_mul_hi_u32 v7, v10, v7 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v7 +; GISEL-NEXT: v_mul_hi_u32 v7, v12, v8 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v5 ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v9, v4 -; GISEL-NEXT: v_addc_u32_e32 v6, vcc, v10, v5, vcc -; GISEL-NEXT: v_mul_lo_u32 v5, 0, v4 -; GISEL-NEXT: v_mul_lo_u32 v7, v2, v6 -; GISEL-NEXT: v_mul_hi_u32 v9, v2, v4 -; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v0, v1 -; GISEL-NEXT: v_subbrev_u32_e32 v8, vcc, 0, v13, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7 -; GISEL-NEXT: v_mul_lo_u32 v7, 0, v6 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v11, v4 +; GISEL-NEXT: v_addc_u32_e32 v5, vcc, v12, v5, vcc +; GISEL-NEXT: v_mul_lo_u32 v6, 0, v4 +; GISEL-NEXT: v_mul_lo_u32 v7, v2, v5 +; GISEL-NEXT: v_mul_hi_u32 v11, v2, v4 +; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v13, v1 +; GISEL-NEXT: v_subbrev_u32_e32 v8, vcc, 0, v16, vcc +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v7 +; GISEL-NEXT: v_mul_lo_u32 v7, 0, v5 ; GISEL-NEXT: v_mul_hi_u32 v4, 0, v4 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v9 -; GISEL-NEXT: v_mul_hi_u32 v9, v2, v6 -; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v11 +; GISEL-NEXT: v_mul_hi_u32 v11, v2, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v7, v4 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v9 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v4, v5 -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v3, v9, 0 -; GISEL-NEXT: v_mul_hi_u32 v7, 0, v6 -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 -; GISEL-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc -; GISEL-NEXT: v_mov_b32_e32 v0, v5 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v3, v7, v[0:1] -; GISEL-NEXT: v_cndmask_b32_e32 v8, v13, v8, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 -; GISEL-NEXT: v_cndmask_b32_e32 v0, v12, v1, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v1, v17, v8, vcc -; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], 0, v9, v[5:6] +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v11 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v4, v6 +; GISEL-NEXT: v_mul_hi_u32 v12, 0, v5 +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v3, v11, 0 +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 +; GISEL-NEXT: v_cndmask_b32_e32 v1, v13, v1, vcc +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v3, v12, v[5:6] +; GISEL-NEXT: v_cndmask_b32_e32 v8, v16, v8, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v1, v15, v8, vcc +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], 0, 
v11, v[6:7] ; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v4 -; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], 0, v7 -; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], 0, v7, vcc +; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], 0, v8 +; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], 0, v8, vcc ; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc ; GISEL-NEXT: v_sub_i32_e32 v7, vcc, v2, v3 ; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll index 23ef596c021c2..c50b491bcb074 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll @@ -129,11 +129,11 @@ define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GFX8-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 -; GFX8-NEXT: v_trunc_f32_e32 v2, v1 -; GFX8-NEXT: v_mul_f32_e32 v1, 0xcf800000, v2 -; GFX8-NEXT: v_add_f32_e32 v0, v1, v0 +; GFX8-NEXT: v_trunc_f32_e32 v1, v1 +; GFX8-NEXT: v_mul_f32_e32 v2, 0xcf800000, v1 +; GFX8-NEXT: v_add_f32_e32 v0, v2, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v6, v0 -; GFX8-NEXT: v_cvt_u32_f32_e32 v7, v2 +; GFX8-NEXT: v_cvt_u32_f32_e32 v7, v1 ; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v6, 0 ; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v7, v[1:2] ; GFX8-NEXT: v_mul_lo_u32 v1, v7, v0 @@ -203,11 +203,11 @@ define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v4, v3 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, v0, v2 -; GFX8-NEXT: v_mul_hi_u32 v4, s9, v1 +; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v0 +; GFX8-NEXT: v_mul_hi_u32 v3, s9, v1 ; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s10, v6, 0 -; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v4, v2 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v3, v2 ; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s10, v7, v[1:2] ; GFX8-NEXT: v_mov_b32_e32 v1, s11 ; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s11, v6, v[2:3] @@ -268,11 +268,11 @@ define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 -; GFX9-NEXT: v_trunc_f32_e32 v2, v1 -; GFX9-NEXT: v_mul_f32_e32 v1, 0xcf800000, v2 -; GFX9-NEXT: v_add_f32_e32 v0, v1, v0 +; GFX9-NEXT: v_trunc_f32_e32 v1, v1 +; GFX9-NEXT: v_mul_f32_e32 v2, 0xcf800000, v1 +; GFX9-NEXT: v_add_f32_e32 v0, v2, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v0 -; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v2 +; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v1 ; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v6, 0 ; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v7, v[1:2] ; GFX9-NEXT: v_mul_lo_u32 v1, v7, v0 @@ -468,31 +468,31 @@ define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 ; GFX10-NEXT: v_add_co_u32 v0, s0, v5, v0 ; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s0 +; GFX10-NEXT: v_mul_hi_u32 v5, s17, v1 ; GFX10-NEXT: v_add_nc_u32_e32 v2, v6, v2 ; GFX10-NEXT: v_add_co_u32 v0, s0, v0, v3 ; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0 -; GFX10-NEXT: v_add_co_u32 v5, s0, v0, v2 -; GFX10-NEXT: v_mul_hi_u32 v2, s17, v1 -; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s0 -; GFX10-NEXT: v_add_nc_u32_e32 v3, v4, v3 -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s0, s18, 
v5, 0 -; GFX10-NEXT: v_add3_u32 v3, v3, v6, v2 +; GFX10-NEXT: v_add_co_u32 v6, s0, v0, v2 +; GFX10-NEXT: v_add_nc_u32_e32 v2, v4, v3 +; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0 +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s0, s18, v6, 0 +; GFX10-NEXT: v_add3_u32 v3, v2, v3, v5 ; GFX10-NEXT: v_mad_u64_u32 v[1:2], s0, s18, v3, v[1:2] -; GFX10-NEXT: v_mad_u64_u32 v[1:2], s0, s19, v5, v[1:2] -; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v5, 1 +; GFX10-NEXT: v_mad_u64_u32 v[1:2], s0, s19, v6, v[1:2] +; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v6, 1 ; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v3, vcc_lo ; GFX10-NEXT: v_sub_co_u32 v7, vcc_lo, s16, v0 -; GFX10-NEXT: v_sub_nc_u32_e32 v6, s17, v1 +; GFX10-NEXT: v_sub_nc_u32_e32 v5, s17, v1 ; GFX10-NEXT: v_sub_co_ci_u32_e64 v8, s0, s17, v1, vcc_lo -; GFX10-NEXT: v_subrev_co_ci_u32_e32 v0, vcc_lo, s19, v6, vcc_lo +; GFX10-NEXT: v_subrev_co_ci_u32_e32 v0, vcc_lo, s19, v5, vcc_lo ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s18, v7 ; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo -; GFX10-NEXT: v_sub_co_u32 v6, vcc_lo, v7, s18 +; GFX10-NEXT: v_sub_co_u32 v5, vcc_lo, v7, s18 ; GFX10-NEXT: v_subrev_co_ci_u32_e64 v9, s0, 0, v0, vcc_lo ; GFX10-NEXT: v_cmp_le_u32_e64 s0, s19, v8 ; GFX10-NEXT: v_subrev_co_ci_u32_e32 v0, vcc_lo, s19, v0, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v10, 0, -1, s0 -; GFX10-NEXT: v_cmp_le_u32_e64 s0, s18, v6 +; GFX10-NEXT: v_cmp_le_u32_e64 s0, s18, v5 ; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, -1, s0 ; GFX10-NEXT: v_cmp_le_u32_e64 s0, s19, v9 ; GFX10-NEXT: v_cndmask_b32_e64 v12, 0, -1, s0 @@ -503,18 +503,18 @@ define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s19, v8 ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v11 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v10, v1, s0 -; GFX10-NEXT: v_sub_co_u32 v10, s0, v6, s18 +; GFX10-NEXT: v_sub_co_u32 v10, s0, v5, s18 ; GFX10-NEXT: v_subrev_co_ci_u32_e64 v0, s0, 0, v0, s0 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v13, vcc_lo ; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, v1 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v4, v14, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v4, v6, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v6, v9, v0, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v4, v5, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v5, v9, v0, vcc_lo ; GFX10-NEXT: v_mov_b32_e32 v9, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v5, v2, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v6, v2, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v3, v1, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v4, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v8, v6, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v8, v5, s0 ; GFX10-NEXT: global_store_dwordx2 v9, v[0:1], s[12:13] ; GFX10-NEXT: global_store_dwordx2 v9, v[2:3], s[14:15] ; GFX10-NEXT: s_endpgm @@ -1005,14 +1005,14 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0 ; GFX8-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX8-NEXT: v_mov_b32_e32 v10, s13 +; GFX8-NEXT: v_mov_b32_e32 v9, s13 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GFX8-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 -; GFX8-NEXT: v_trunc_f32_e32 v2, v1 -; GFX8-NEXT: v_mul_f32_e32 v1, 0xcf800000, v2 -; GFX8-NEXT: v_add_f32_e32 v0, v1, v0 +; GFX8-NEXT: v_trunc_f32_e32 v1, v1 +; GFX8-NEXT: v_mul_f32_e32 v2, 0xcf800000, v1 +; GFX8-NEXT: v_add_f32_e32 v0, v2, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v6, v0 -; GFX8-NEXT: v_cvt_u32_f32_e32 v7, v2 +; GFX8-NEXT: v_cvt_u32_f32_e32 v7, v1 ; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v6, 0 
; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v7, v[1:2] ; GFX8-NEXT: v_mul_lo_u32 v1, v7, v0 @@ -1042,12 +1042,10 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v6, 0 ; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v7, v[1:2] ; GFX8-NEXT: v_mul_lo_u32 v1, v7, v0 -; GFX8-NEXT: s_sub_u32 s2, 0, s14 ; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s3, v6, v[2:3] ; GFX8-NEXT: v_mul_hi_u32 v3, v6, v0 ; GFX8-NEXT: v_mul_hi_u32 v0, v7, v0 ; GFX8-NEXT: v_mul_lo_u32 v2, v6, v4 -; GFX8-NEXT: s_subb_u32 s3, 0, s15 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v3 @@ -1084,112 +1082,113 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v4, v3 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, v0, v2 -; GFX8-NEXT: v_mul_hi_u32 v4, s9, v1 +; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v0 +; GFX8-NEXT: v_mul_hi_u32 v3, s9, v1 ; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s12, v8, 0 -; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, v4, v2 -; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s12, v9, v[1:2] +; GFX8-NEXT: v_add_u32_e64 v17, s[2:3], 1, v8 +; GFX8-NEXT: v_add_u32_e32 v10, vcc, v3, v2 +; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s12, v10, v[1:2] ; GFX8-NEXT: v_sub_u32_e32 v1, vcc, s8, v0 ; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s13, v8, v[2:3] ; GFX8-NEXT: v_mov_b32_e32 v2, s9 +; GFX8-NEXT: v_cvt_f32_u32_e32 v5, s14 ; GFX8-NEXT: v_subb_u32_e64 v0, s[0:1], v2, v4, vcc ; GFX8-NEXT: v_sub_u32_e64 v2, s[0:1], s9, v4 +; GFX8-NEXT: v_cvt_f32_u32_e32 v4, s15 ; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v0 ; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[0:1] ; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s12, v1 -; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[0:1] +; GFX8-NEXT: v_mul_f32_e32 v4, 0x4f800000, v4 +; GFX8-NEXT: v_add_f32_e32 v4, v4, v5 +; GFX8-NEXT: v_rcp_iflag_f32_e32 v4, v4 +; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[0:1] ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s13, v0 -; GFX8-NEXT: v_cndmask_b32_e64 v11, v3, v4, s[0:1] -; GFX8-NEXT: v_cvt_f32_u32_e32 v3, s15 -; GFX8-NEXT: v_cvt_f32_u32_e32 v4, s14 -; GFX8-NEXT: v_subb_u32_e32 v12, vcc, v2, v10, vcc -; GFX8-NEXT: v_mul_f32_e32 v2, 0x4f800000, v3 -; GFX8-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX8-NEXT: v_rcp_iflag_f32_e32 v2, v2 -; GFX8-NEXT: v_subrev_u32_e32 v13, vcc, s12, v1 -; GFX8-NEXT: v_subbrev_u32_e64 v14, s[0:1], 0, v12, vcc -; GFX8-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 +; GFX8-NEXT: v_subb_u32_e32 v12, vcc, v2, v9, vcc +; GFX8-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v4 +; GFX8-NEXT: v_cndmask_b32_e64 v11, v3, v6, s[0:1] ; GFX8-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2 -; GFX8-NEXT: v_trunc_f32_e32 v4, v3 -; GFX8-NEXT: v_mul_f32_e32 v3, 0xcf800000, v4 -; GFX8-NEXT: v_add_f32_e32 v2, v3, v2 -; GFX8-NEXT: v_cvt_u32_f32_e32 v15, v2 -; GFX8-NEXT: v_add_u32_e64 v16, s[0:1], 1, v8 -; GFX8-NEXT: v_addc_u32_e64 v17, s[0:1], 0, v9, s[0:1] -; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v15, 0 -; GFX8-NEXT: v_cvt_u32_f32_e32 v18, v4 -; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v14 -; GFX8-NEXT: v_cndmask_b32_e64 v19, 0, -1, s[0:1] -; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s2, v18, v[3:4] -; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s12, v13 +; GFX8-NEXT: v_trunc_f32_e32 v3, v3 +; GFX8-NEXT: v_mul_f32_e32 v4, 
0xcf800000, v3 +; GFX8-NEXT: v_add_f32_e32 v2, v4, v2 +; GFX8-NEXT: v_cvt_u32_f32_e32 v13, v2 +; GFX8-NEXT: s_sub_u32 s8, 0, s14 +; GFX8-NEXT: v_cvt_u32_f32_e32 v14, v3 +; GFX8-NEXT: v_subrev_u32_e32 v15, vcc, s12, v1 +; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s8, v13, 0 +; GFX8-NEXT: v_subbrev_u32_e64 v16, s[0:1], 0, v12, vcc +; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s8, v14, v[3:4] +; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v16 +; GFX8-NEXT: s_subb_u32 s9, 0, s15 ; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[0:1] -; GFX8-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s3, v15, v[4:5] -; GFX8-NEXT: v_mul_lo_u32 v4, v18, v2 -; GFX8-NEXT: v_subb_u32_e32 v7, vcc, v12, v10, vcc -; GFX8-NEXT: v_mul_lo_u32 v5, v15, v6 -; GFX8-NEXT: v_mul_hi_u32 v10, v15, v2 -; GFX8-NEXT: v_mul_hi_u32 v2, v18, v2 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s13, v14 -; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v5 -; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v10 -; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GFX8-NEXT: v_mul_lo_u32 v10, v18, v6 -; GFX8-NEXT: v_add_u32_e32 v4, vcc, v5, v4 -; GFX8-NEXT: v_mul_hi_u32 v5, v15, v6 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v10, v2 -; GFX8-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GFX8-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s9, v13, v[4:5] +; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s12, v15 +; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[0:1] +; GFX8-NEXT: v_mul_lo_u32 v5, v14, v2 +; GFX8-NEXT: v_mul_lo_u32 v7, v13, v6 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s13, v16 +; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[0:1] +; GFX8-NEXT: v_mul_hi_u32 v4, v13, v2 +; GFX8-NEXT: v_add_u32_e64 v5, s[0:1], v5, v7 +; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[0:1] +; GFX8-NEXT: v_add_u32_e64 v4, s[0:1], v5, v4 +; GFX8-NEXT: v_subb_u32_e32 v4, vcc, v12, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[0:1] +; GFX8-NEXT: v_mul_hi_u32 v2, v14, v2 +; GFX8-NEXT: v_mul_lo_u32 v9, v14, v6 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, v7, v5 +; GFX8-NEXT: v_mul_hi_u32 v7, v13, v6 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v9, v2 +; GFX8-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v7 +; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v9, v7 +; GFX8-NEXT: v_addc_u32_e64 v18, s[2:3], 0, v10, s[2:3] +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 1, v17 +; GFX8-NEXT: v_addc_u32_e32 v12, vcc, 0, v18, vcc +; GFX8-NEXT: v_subrev_u32_e32 v19, vcc, s12, v15 +; GFX8-NEXT: v_mul_hi_u32 v6, v14, v6 +; GFX8-NEXT: v_subbrev_u32_e32 v20, vcc, 0, v4, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v5 -; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v5, vcc, v10, v5 -; GFX8-NEXT: v_add_u32_e32 v10, vcc, 1, v16 -; GFX8-NEXT: v_addc_u32_e32 v12, vcc, 0, v17, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v3, v19, v3, s[0:1] -; GFX8-NEXT: v_subrev_u32_e32 v19, vcc, s12, v13 -; GFX8-NEXT: v_mul_hi_u32 v6, v18, v6 -; GFX8-NEXT: v_subbrev_u32_e32 v20, vcc, 0, v7, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v4 ; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v4, vcc, v5, v4 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v7, v4 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v6, v4 -; GFX8-NEXT: v_add_u32_e32 v15, vcc, v15, v2 -; GFX8-NEXT: v_addc_u32_e32 v18, vcc, v18, v4, vcc -; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s2, v15, 0 +; GFX8-NEXT: v_add_u32_e32 v13, vcc, v13, v2 +; GFX8-NEXT: v_addc_u32_e32 v14, vcc, v14, v4, vcc +; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s8, v13, 0 ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; GFX8-NEXT: v_cndmask_b32_e32 
v3, v16, v10, vcc -; GFX8-NEXT: v_mov_b32_e32 v2, v5 -; GFX8-NEXT: v_mad_u64_u32 v[5:6], s[0:1], s2, v18, v[2:3] +; GFX8-NEXT: v_cndmask_b32_e32 v2, v17, v9, vcc +; GFX8-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s8, v14, v[5:6] ; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v11 -; GFX8-NEXT: v_cndmask_b32_e64 v2, v8, v3, s[0:1] -; GFX8-NEXT: v_mad_u64_u32 v[7:8], s[2:3], s3, v15, v[5:6] -; GFX8-NEXT: v_cndmask_b32_e32 v10, v17, v12, vcc -; GFX8-NEXT: v_mul_lo_u32 v6, v18, v4 -; GFX8-NEXT: v_mul_lo_u32 v8, v15, v7 -; GFX8-NEXT: v_cndmask_b32_e64 v3, v9, v10, s[0:1] -; GFX8-NEXT: v_mul_hi_u32 v9, v15, v4 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v13, v19, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v10, v14, v20, vcc -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v8 -; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v2, v8, v2, s[0:1] +; GFX8-NEXT: v_mad_u64_u32 v[8:9], s[2:3], s9, v13, v[6:7] +; GFX8-NEXT: v_mul_lo_u32 v6, v14, v4 +; GFX8-NEXT: v_mul_hi_u32 v9, v13, v4 +; GFX8-NEXT: v_mul_lo_u32 v7, v13, v8 +; GFX8-NEXT: v_cndmask_b32_e32 v3, v18, v12, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v3, v10, v3, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e32 v5, v15, v19, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v10, v16, v20, vcc +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v7 +; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v9 ; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX8-NEXT: v_mul_lo_u32 v9, v18, v7 -; GFX8-NEXT: v_mul_hi_u32 v4, v18, v4 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v8, v6 -; GFX8-NEXT: v_mul_hi_u32 v8, v15, v7 +; GFX8-NEXT: v_mul_lo_u32 v9, v14, v8 +; GFX8-NEXT: v_mul_hi_u32 v4, v14, v4 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v7, v6 +; GFX8-NEXT: v_mul_hi_u32 v7, v13, v8 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v9, v4 ; GFX8-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v8 -; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v9, v8 -; GFX8-NEXT: v_mul_hi_u32 v7, v18, v7 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v7 +; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v9, v7 +; GFX8-NEXT: v_mul_hi_u32 v8, v14, v8 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v6 ; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v8, v6 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, v7, v6 -; GFX8-NEXT: v_add_u32_e32 v4, vcc, v15, v4 -; GFX8-NEXT: v_addc_u32_e32 v7, vcc, v18, v6, vcc +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v8, v6 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v13, v4 +; GFX8-NEXT: v_addc_u32_e32 v7, vcc, v14, v6, vcc ; GFX8-NEXT: v_mul_lo_u32 v8, s11, v4 ; GFX8-NEXT: v_mul_lo_u32 v9, s10, v7 ; GFX8-NEXT: v_cndmask_b32_e64 v6, v1, v5, s[0:1] @@ -1206,15 +1205,14 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v8 ; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v5, v8 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v8 ; GFX8-NEXT: v_add_u32_e32 v11, vcc, v4, v1 +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX8-NEXT: v_mul_hi_u32 v7, s11, v7 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v5, v1 ; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[2:3], s14, v11, 0 -; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v1, vcc, v8, v1 ; GFX8-NEXT: v_add_u32_e32 v12, vcc, v7, v1 -; GFX8-NEXT: v_mov_b32_e32 v1, v5 -; GFX8-NEXT: v_mad_u64_u32 v[8:9], s[2:3], s14, v12, v[1:2] +; GFX8-NEXT: v_mad_u64_u32 v[8:9], s[2:3], s14, v12, v[5:6] ; GFX8-NEXT: 
v_cndmask_b32_e64 v7, v0, v10, s[0:1] ; GFX8-NEXT: v_mov_b32_e32 v5, s15 ; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s15, v11, v[8:9] @@ -1274,13 +1272,16 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0 ; GFX9-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX9-NEXT: v_mov_b32_e32 v9, s5 +; GFX9-NEXT: s_sub_u32 s8, 0, s6 +; GFX9-NEXT: s_subb_u32 s9, 0, s7 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 -; GFX9-NEXT: v_trunc_f32_e32 v2, v1 -; GFX9-NEXT: v_mul_f32_e32 v1, 0xcf800000, v2 -; GFX9-NEXT: v_add_f32_e32 v0, v1, v0 +; GFX9-NEXT: v_trunc_f32_e32 v1, v1 +; GFX9-NEXT: v_mul_f32_e32 v2, 0xcf800000, v1 +; GFX9-NEXT: v_add_f32_e32 v0, v2, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v0 -; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v2 +; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v1 ; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v6, 0 ; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v7, v[1:2] ; GFX9-NEXT: v_mul_lo_u32 v1, v7, v0 @@ -1307,15 +1308,12 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v6, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v7, v1, vcc ; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v6, 0 -; GFX9-NEXT: v_mov_b32_e32 v8, s5 ; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v7, v[1:2] ; GFX9-NEXT: v_mul_lo_u32 v1, v7, v0 -; GFX9-NEXT: s_sub_u32 s2, 0, s6 ; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s3, v6, v[2:3] ; GFX9-NEXT: v_mul_hi_u32 v3, v6, v0 ; GFX9-NEXT: v_mul_hi_u32 v0, v7, v0 ; GFX9-NEXT: v_mul_lo_u32 v2, v6, v4 -; GFX9-NEXT: s_subb_u32 s3, 0, s7 ; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v2 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v3 @@ -1350,134 +1348,132 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v9, vcc, v0, v2 -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s4, v9, 0 +; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v0, v2 +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s4, v8, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX9-NEXT: v_add_u32_e32 v3, v4, v3 ; GFX9-NEXT: v_add3_u32 v10, v3, v2, v5 ; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s4, v10, v[1:2] ; GFX9-NEXT: v_sub_co_u32_e32 v1, vcc, s16, v0 -; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s5, v9, v[2:3] +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s5, v8, v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v2, s17 +; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s6 ; GFX9-NEXT: v_subb_co_u32_e64 v0, s[0:1], v2, v4, vcc +; GFX9-NEXT: v_sub_u32_e32 v2, s17, v4 +; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s7 ; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s5, v0 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[0:1] ; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s4, v1 -; GFX9-NEXT: v_sub_u32_e32 v2, s17, v4 -; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[0:1] +; GFX9-NEXT: v_mul_f32_e32 v4, 0x4f800000, v4 +; GFX9-NEXT: v_add_f32_e32 v4, v4, v5 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v4 +; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[0:1] ; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s5, v0 -; GFX9-NEXT: v_cndmask_b32_e64 v11, v3, v4, s[0:1] -; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s7 -; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s6 -; GFX9-NEXT: v_subb_co_u32_e32 v12, vcc, v2, v8, vcc -; GFX9-NEXT: v_mul_f32_e32 v2, 0x4f800000, v3 -; GFX9-NEXT: v_add_f32_e32 v2, v2, v4 -; 
GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v2 -; GFX9-NEXT: v_subrev_co_u32_e32 v13, vcc, s4, v1 -; GFX9-NEXT: v_subbrev_co_u32_e64 v14, s[0:1], 0, v12, vcc -; GFX9-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 +; GFX9-NEXT: v_subb_co_u32_e32 v12, vcc, v2, v9, vcc +; GFX9-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v4 +; GFX9-NEXT: v_cndmask_b32_e64 v11, v3, v6, s[0:1] ; GFX9-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2 -; GFX9-NEXT: v_trunc_f32_e32 v4, v3 -; GFX9-NEXT: v_mul_f32_e32 v3, 0xcf800000, v4 -; GFX9-NEXT: v_add_f32_e32 v2, v3, v2 -; GFX9-NEXT: v_cvt_u32_f32_e32 v15, v2 -; GFX9-NEXT: v_add_co_u32_e64 v16, s[0:1], 1, v9 -; GFX9-NEXT: v_addc_co_u32_e64 v17, s[0:1], 0, v10, s[0:1] -; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v15, 0 -; GFX9-NEXT: v_cvt_u32_f32_e32 v18, v4 -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s5, v14 -; GFX9-NEXT: v_cndmask_b32_e64 v19, 0, -1, s[0:1] -; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s2, v18, v[3:4] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s4, v13 +; GFX9-NEXT: v_trunc_f32_e32 v3, v3 +; GFX9-NEXT: v_mul_f32_e32 v4, 0xcf800000, v3 +; GFX9-NEXT: v_add_f32_e32 v2, v4, v2 +; GFX9-NEXT: v_cvt_u32_f32_e32 v13, v2 +; GFX9-NEXT: v_cvt_u32_f32_e32 v14, v3 +; GFX9-NEXT: v_subrev_co_u32_e32 v15, vcc, s4, v1 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s8, v13, 0 +; GFX9-NEXT: v_subbrev_co_u32_e64 v16, s[0:1], 0, v12, vcc +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s8, v14, v[3:4] +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s5, v16 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[0:1] -; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s3, v15, v[4:5] -; GFX9-NEXT: v_mul_lo_u32 v4, v18, v2 -; GFX9-NEXT: v_subb_co_u32_e32 v7, vcc, v12, v8, vcc -; GFX9-NEXT: v_mul_lo_u32 v5, v15, v6 -; GFX9-NEXT: v_mul_hi_u32 v8, v15, v2 -; GFX9-NEXT: v_mul_hi_u32 v2, v18, v2 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s5, v14 -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5 -; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v8 -; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GFX9-NEXT: v_mul_lo_u32 v8, v18, v6 -; GFX9-NEXT: v_add_u32_e32 v4, v5, v4 -; GFX9-NEXT: v_mul_hi_u32 v5, v15, v6 -; GFX9-NEXT: v_mul_hi_u32 v6, v18, v6 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v8, v2 -; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s9, v13, v[4:5] +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s4, v15 +; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[0:1] +; GFX9-NEXT: v_mul_lo_u32 v5, v14, v2 +; GFX9-NEXT: v_mul_lo_u32 v7, v13, v6 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s5, v16 +; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[0:1] +; GFX9-NEXT: v_mul_hi_u32 v4, v13, v2 +; GFX9-NEXT: v_add_co_u32_e64 v5, s[0:1], v5, v7 +; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[0:1] +; GFX9-NEXT: v_add_co_u32_e64 v4, s[0:1], v5, v4 +; GFX9-NEXT: v_subb_co_u32_e32 v4, vcc, v12, v9, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[0:1] +; GFX9-NEXT: v_mul_hi_u32 v2, v14, v2 +; GFX9-NEXT: v_mul_lo_u32 v9, v14, v6 +; GFX9-NEXT: v_add_u32_e32 v5, v7, v5 +; GFX9-NEXT: v_mul_hi_u32 v7, v13, v6 +; GFX9-NEXT: v_add_co_u32_e64 v17, s[2:3], 1, v8 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v9, v2 +; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v7 +; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GFX9-NEXT: v_addc_co_u32_e64 v18, s[2:3], 0, v10, s[2:3] +; GFX9-NEXT: v_add_u32_e32 v7, v9, v7 +; GFX9-NEXT: v_add_co_u32_e32 v9, vcc, 1, v17 +; GFX9-NEXT: v_addc_co_u32_e32 v12, vcc, 0, v18, vcc +; GFX9-NEXT: v_mul_hi_u32 v6, v14, v6 +; GFX9-NEXT: v_subrev_co_u32_e32 v19, vcc, s4, v15 +; 
GFX9-NEXT: v_subbrev_co_u32_e32 v20, vcc, 0, v4, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5 -; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX9-NEXT: v_add_u32_e32 v5, v8, v5 -; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, 1, v16 -; GFX9-NEXT: v_addc_co_u32_e32 v12, vcc, 0, v17, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 ; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GFX9-NEXT: v_add3_u32 v4, v5, v4, v6 -; GFX9-NEXT: v_add_co_u32_e32 v15, vcc, v15, v2 -; GFX9-NEXT: v_cndmask_b32_e64 v3, v19, v3, s[0:1] -; GFX9-NEXT: v_addc_co_u32_e32 v18, vcc, v18, v4, vcc -; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s2, v15, 0 -; GFX9-NEXT: v_subrev_co_u32_e32 v19, vcc, s4, v13 -; GFX9-NEXT: v_mov_b32_e32 v2, v5 -; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[0:1], s2, v18, v[2:3] -; GFX9-NEXT: v_subbrev_co_u32_e32 v20, vcc, 0, v7, vcc +; GFX9-NEXT: v_add3_u32 v4, v7, v4, v6 +; GFX9-NEXT: v_add_co_u32_e32 v13, vcc, v13, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v14, vcc, v14, v4, vcc +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s8, v13, 0 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v16, v8, vcc -; GFX9-NEXT: v_mad_u64_u32 v[7:8], s[0:1], s3, v15, v[5:6] +; GFX9-NEXT: v_cndmask_b32_e32 v2, v17, v9, vcc +; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s8, v14, v[5:6] ; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v11 -; GFX9-NEXT: v_mul_lo_u32 v5, v18, v4 -; GFX9-NEXT: v_mul_lo_u32 v6, v15, v7 -; GFX9-NEXT: v_cndmask_b32_e64 v2, v9, v2, s[0:1] -; GFX9-NEXT: v_mul_hi_u32 v9, v15, v4 -; GFX9-NEXT: v_mul_hi_u32 v4, v18, v4 -; GFX9-NEXT: v_add_co_u32_e64 v5, s[2:3], v5, v6 -; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[2:3] -; GFX9-NEXT: v_add_co_u32_e64 v5, s[2:3], v5, v9 -; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[2:3] -; GFX9-NEXT: v_mul_lo_u32 v9, v18, v7 -; GFX9-NEXT: v_add_u32_e32 v5, v6, v5 -; GFX9-NEXT: v_mul_hi_u32 v6, v15, v7 -; GFX9-NEXT: v_mul_hi_u32 v7, v18, v7 -; GFX9-NEXT: v_add_co_u32_e64 v4, s[2:3], v9, v4 -; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[2:3] -; GFX9-NEXT: v_add_co_u32_e64 v4, s[2:3], v4, v6 -; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[2:3] -; GFX9-NEXT: v_add_co_u32_e64 v4, s[2:3], v4, v5 -; GFX9-NEXT: v_add_u32_e32 v6, v9, v6 -; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[2:3] -; GFX9-NEXT: v_add3_u32 v5, v6, v5, v7 -; GFX9-NEXT: v_add_co_u32_e64 v4, s[2:3], v15, v4 -; GFX9-NEXT: v_addc_co_u32_e64 v5, s[2:3], v18, v5, s[2:3] -; GFX9-NEXT: v_mul_lo_u32 v6, s19, v4 -; GFX9-NEXT: v_mul_lo_u32 v7, s18, v5 -; GFX9-NEXT: v_mul_hi_u32 v9, s18, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v17, v12, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v2, v8, v2, s[0:1] +; GFX9-NEXT: v_mad_u64_u32 v[8:9], s[2:3], s9, v13, v[6:7] +; GFX9-NEXT: v_mul_lo_u32 v6, v14, v4 +; GFX9-NEXT: v_mul_hi_u32 v9, v13, v4 +; GFX9-NEXT: v_mul_lo_u32 v7, v13, v8 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v18, v12, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v3, v10, v3, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e32 v8, v13, v19, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v10, v14, v20, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v5, v15, v19, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v10, v16, v20, vcc ; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v6, v7 ; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v6, v9 ; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX9-NEXT: v_mul_lo_u32 v9, s19, v5 -; GFX9-NEXT: v_mul_hi_u32 v4, s19, v4 +; GFX9-NEXT: v_mul_lo_u32 v9, v14, v8 +; GFX9-NEXT: v_mul_hi_u32 v4, v14, v4 ; GFX9-NEXT: v_add_u32_e32 v6, v7, v6 -; GFX9-NEXT: v_mul_hi_u32 v7, s18, v5 -; GFX9-NEXT: v_mul_hi_u32 v13, s19, v5 +; GFX9-NEXT: 
v_mul_hi_u32 v7, v13, v8 +; GFX9-NEXT: v_mul_hi_u32 v8, v14, v8 ; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v9, v4 ; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v7 ; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v11, vcc, v4, v6 +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v6 +; GFX9-NEXT: v_add_u32_e32 v7, v9, v7 +; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GFX9-NEXT: v_add3_u32 v6, v7, v6, v8 +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v13, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v14, v6, vcc +; GFX9-NEXT: v_mul_lo_u32 v8, s19, v4 +; GFX9-NEXT: v_mul_lo_u32 v9, s18, v7 +; GFX9-NEXT: v_cndmask_b32_e64 v6, v1, v5, s[0:1] +; GFX9-NEXT: v_mul_hi_u32 v1, s18, v4 +; GFX9-NEXT: v_mul_hi_u32 v4, s19, v4 +; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v8, v9 +; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v5, v1 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX9-NEXT: v_mul_lo_u32 v5, s19, v7 +; GFX9-NEXT: v_add_u32_e32 v1, v8, v1 +; GFX9-NEXT: v_mul_hi_u32 v8, s18, v7 +; GFX9-NEXT: v_mul_hi_u32 v7, s19, v7 +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v5, v4 +; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v8 +; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v11, vcc, v4, v1 ; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[2:3], s6, v11, 0 -; GFX9-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v6, v1, v8, s[0:1] -; GFX9-NEXT: v_add_u32_e32 v1, v9, v7 -; GFX9-NEXT: v_add3_u32 v12, v1, v12, v13 -; GFX9-NEXT: v_mov_b32_e32 v1, v5 -; GFX9-NEXT: v_mad_u64_u32 v[8:9], s[2:3], s6, v12, v[1:2] +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX9-NEXT: v_add_u32_e32 v8, v9, v8 +; GFX9-NEXT: v_add3_u32 v12, v8, v1, v7 +; GFX9-NEXT: v_mad_u64_u32 v[8:9], s[2:3], s6, v12, v[5:6] ; GFX9-NEXT: v_cndmask_b32_e64 v7, v0, v10, s[0:1] ; GFX9-NEXT: v_mov_b32_e32 v5, s7 ; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s7, v11, v[8:9] @@ -1546,14 +1542,14 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX10-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v1 ; GFX10-NEXT: v_mul_f32_e32 v2, 0x2f800000, v0 ; GFX10-NEXT: v_mul_f32_e32 v3, 0x2f800000, v1 -; GFX10-NEXT: v_trunc_f32_e32 v4, v2 -; GFX10-NEXT: v_trunc_f32_e32 v5, v3 -; GFX10-NEXT: v_mul_f32_e32 v2, 0xcf800000, v4 -; GFX10-NEXT: v_mul_f32_e32 v3, 0xcf800000, v5 -; GFX10-NEXT: v_cvt_u32_f32_e32 v9, v4 -; GFX10-NEXT: v_cvt_u32_f32_e32 v10, v5 -; GFX10-NEXT: v_add_f32_e32 v0, v2, v0 -; GFX10-NEXT: v_add_f32_e32 v1, v3, v1 +; GFX10-NEXT: v_trunc_f32_e32 v2, v2 +; GFX10-NEXT: v_trunc_f32_e32 v4, v3 +; GFX10-NEXT: v_mul_f32_e32 v3, 0xcf800000, v2 +; GFX10-NEXT: v_mul_f32_e32 v5, 0xcf800000, v4 +; GFX10-NEXT: v_cvt_u32_f32_e32 v9, v2 +; GFX10-NEXT: v_cvt_u32_f32_e32 v10, v4 +; GFX10-NEXT: v_add_f32_e32 v0, v3, v0 +; GFX10-NEXT: v_add_f32_e32 v1, v5, v1 ; GFX10-NEXT: v_cvt_u32_f32_e32 v7, v0 ; GFX10-NEXT: v_cvt_u32_f32_e32 v8, v1 ; GFX10-NEXT: v_mad_u64_u32 v[0:1], s0, s1, v7, 0 @@ -1662,119 +1658,119 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX10-NEXT: v_mul_hi_u32 v4, s17, v4 ; GFX10-NEXT: v_mul_lo_u32 v9, s17, v2 ; GFX10-NEXT: v_mul_lo_u32 v6, s19, v1 -; GFX10-NEXT: v_mul_hi_u32 v10, s16, v2 -; GFX10-NEXT: v_mul_hi_u32 v11, s17, v2 -; GFX10-NEXT: v_mul_lo_u32 v2, s18, v0 +; GFX10-NEXT: v_mul_lo_u32 v11, s18, v0 ; GFX10-NEXT: v_mul_hi_u32 v7, s18, v1 ; GFX10-NEXT: v_mul_hi_u32 v1, s19, v1 ; GFX10-NEXT: v_mul_lo_u32 v12, s19, v0 
-; GFX10-NEXT: v_mul_hi_u32 v13, s18, v0 -; GFX10-NEXT: v_mul_hi_u32 v14, s19, v0 -; GFX10-NEXT: v_add_co_u32 v0, s0, v3, v8 -; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0 -; GFX10-NEXT: v_add_co_u32 v4, s0, v9, v4 +; GFX10-NEXT: v_add_co_u32 v3, s0, v3, v8 +; GFX10-NEXT: v_mul_hi_u32 v10, s16, v2 ; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, 1, s0 -; GFX10-NEXT: v_add_co_u32 v2, s0, v6, v2 -; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s0 -; GFX10-NEXT: v_add_co_u32 v1, s0, v12, v1 +; GFX10-NEXT: v_add_co_u32 v4, s0, v9, v4 ; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, 1, s0 -; GFX10-NEXT: v_add_co_u32 v0, s0, v0, v5 -; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX10-NEXT: v_add_co_u32 v6, s0, v6, v11 +; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, 1, s0 +; GFX10-NEXT: v_add_co_u32 v1, s0, v12, v1 +; GFX10-NEXT: v_mul_hi_u32 v13, s18, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v12, 0, 1, s0 +; GFX10-NEXT: v_add_co_u32 v3, s0, v3, v5 +; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0 ; GFX10-NEXT: v_add_co_u32 v4, s0, v4, v10 ; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s0 -; GFX10-NEXT: v_add_co_u32 v2, s0, v2, v7 -; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 -; GFX10-NEXT: v_add_nc_u32_e32 v0, v3, v0 +; GFX10-NEXT: v_add_co_u32 v6, s0, v6, v7 +; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s0 ; GFX10-NEXT: v_add_co_u32 v1, s0, v1, v13 ; GFX10-NEXT: v_cndmask_b32_e64 v7, 0, 1, s0 -; GFX10-NEXT: v_add_nc_u32_e32 v2, v6, v2 -; GFX10-NEXT: v_add_nc_u32_e32 v5, v8, v5 -; GFX10-NEXT: v_add_co_u32 v8, s0, v4, v0 +; GFX10-NEXT: v_mul_hi_u32 v10, s19, v0 +; GFX10-NEXT: v_add_nc_u32_e32 v0, v8, v3 +; GFX10-NEXT: v_add_nc_u32_e32 v3, v9, v5 +; GFX10-NEXT: v_add_nc_u32_e32 v5, v11, v6 +; GFX10-NEXT: v_mul_hi_u32 v2, s17, v2 +; GFX10-NEXT: v_add_nc_u32_e32 v6, v12, v7 +; GFX10-NEXT: v_add_co_u32 v7, s0, v4, v0 ; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s0 -; GFX10-NEXT: v_add_co_u32 v10, s0, v1, v2 -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s1, s4, v8, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s0 -; GFX10-NEXT: v_mad_u64_u32 v[2:3], s0, s6, v10, 0 -; GFX10-NEXT: v_add_nc_u32_e32 v7, v9, v7 -; GFX10-NEXT: v_add3_u32 v9, v5, v4, v11 -; GFX10-NEXT: v_add_co_u32 v12, vcc_lo, v8, 1 -; GFX10-NEXT: v_mov_b32_e32 v11, 0 -; GFX10-NEXT: v_add3_u32 v7, v7, v6, v14 +; GFX10-NEXT: v_add_co_u32 v8, s0, v1, v5 +; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s0 +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s0, s4, v7, 0 +; GFX10-NEXT: v_add3_u32 v9, v3, v4, v2 +; GFX10-NEXT: v_mad_u64_u32 v[2:3], s0, s6, v8, 0 +; GFX10-NEXT: v_add3_u32 v10, v6, v5, v10 +; GFX10-NEXT: v_mov_b32_e32 v12, 0 ; GFX10-NEXT: v_mad_u64_u32 v[4:5], s0, s4, v9, v[1:2] -; GFX10-NEXT: v_add_co_ci_u32_e32 v13, vcc_lo, 0, v9, vcc_lo -; GFX10-NEXT: v_mad_u64_u32 v[5:6], s0, s6, v7, v[3:4] -; GFX10-NEXT: v_mad_u64_u32 v[3:4], s0, s5, v8, v[4:5] -; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, v12, 1 -; GFX10-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, 0, v13, vcc_lo +; GFX10-NEXT: v_add_co_u32 v1, vcc_lo, v7, 1 +; GFX10-NEXT: v_add_co_ci_u32_e32 v11, vcc_lo, 0, v9, vcc_lo +; GFX10-NEXT: v_mad_u64_u32 v[5:6], s0, s6, v10, v[3:4] +; GFX10-NEXT: v_add_co_u32 v6, vcc_lo, v1, 1 +; GFX10-NEXT: v_add_co_ci_u32_e32 v13, vcc_lo, 0, v11, vcc_lo ; GFX10-NEXT: v_sub_co_u32 v14, vcc_lo, s16, v0 -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s0, s7, v10, v[5:6] -; GFX10-NEXT: v_sub_co_ci_u32_e64 v5, s0, s17, v3, vcc_lo -; GFX10-NEXT: v_cmp_le_u32_e64 s0, s4, v14 -; GFX10-NEXT: v_sub_nc_u32_e32 v1, s17, v3 -; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, -1, s0 -; GFX10-NEXT: v_sub_co_u32 v15, s0, s18, v2 -; GFX10-NEXT: v_subrev_co_ci_u32_e32 v1, vcc_lo, 
s5, v1, vcc_lo -; GFX10-NEXT: v_sub_co_ci_u32_e64 v16, s1, s19, v0, s0 -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s6, v15 -; GFX10-NEXT: v_sub_nc_u32_e32 v0, s19, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc_lo -; GFX10-NEXT: v_sub_co_u32 v17, vcc_lo, v14, s4 -; GFX10-NEXT: v_subrev_co_ci_u32_e64 v18, s1, 0, v1, vcc_lo -; GFX10-NEXT: v_cmp_le_u32_e64 s1, s5, v5 -; GFX10-NEXT: v_subrev_co_ci_u32_e64 v23, s0, s7, v0, s0 -; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s5, v18 -; GFX10-NEXT: v_subrev_co_ci_u32_e32 v1, vcc_lo, s5, v1, vcc_lo +; GFX10-NEXT: v_mad_u64_u32 v[3:4], s0, s5, v7, v[4:5] +; GFX10-NEXT: v_mad_u64_u32 v[4:5], s0, s7, v8, v[5:6] +; GFX10-NEXT: v_sub_nc_u32_e32 v5, s17, v3 +; GFX10-NEXT: v_sub_co_ci_u32_e64 v3, s0, s17, v3, vcc_lo +; GFX10-NEXT: v_subrev_co_ci_u32_e32 v0, vcc_lo, s5, v5, vcc_lo +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s4, v14 +; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc_lo +; GFX10-NEXT: v_sub_co_u32 v15, vcc_lo, v14, s4 +; GFX10-NEXT: v_subrev_co_ci_u32_e64 v16, s0, 0, v0, vcc_lo +; GFX10-NEXT: v_sub_co_u32 v17, s0, s18, v2 +; GFX10-NEXT: v_sub_co_ci_u32_e64 v18, s1, s19, v4, s0 +; GFX10-NEXT: v_cmp_le_u32_e64 s1, s4, v15 +; GFX10-NEXT: v_subrev_co_ci_u32_e32 v0, vcc_lo, s5, v0, vcc_lo +; GFX10-NEXT: v_sub_nc_u32_e32 v4, s19, v4 +; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, -1, s1 +; GFX10-NEXT: v_cmp_le_u32_e64 s1, s5, v16 ; GFX10-NEXT: v_cndmask_b32_e64 v19, 0, -1, s1 -; GFX10-NEXT: v_cmp_le_u32_e64 s1, s4, v17 +; GFX10-NEXT: v_cmp_le_u32_e64 s1, s5, v3 ; GFX10-NEXT: v_cndmask_b32_e64 v20, 0, -1, s1 -; GFX10-NEXT: v_cmp_le_u32_e64 s1, s5, v18 -; GFX10-NEXT: v_cndmask_b32_e64 v21, 0, -1, s1 -; GFX10-NEXT: v_cmp_le_u32_e64 s1, s7, v16 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v21, v20, s0 -; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s5, v5 -; GFX10-NEXT: v_cndmask_b32_e64 v22, 0, -1, s1 -; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v19, v3, s0 -; GFX10-NEXT: v_sub_co_u32 v0, s0, v17, s4 -; GFX10-NEXT: v_subrev_co_ci_u32_e64 v19, s0, 0, v1, s0 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v12, v4, vcc_lo -; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v3, v13, v6, vcc_lo -; GFX10-NEXT: v_sub_co_u32 v6, s1, v15, s6 -; GFX10-NEXT: v_cndmask_b32_e32 v4, v17, v0, vcc_lo -; GFX10-NEXT: v_subrev_co_ci_u32_e64 v12, s2, 0, v23, s1 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v8, v1, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v9, v3, s0 -; GFX10-NEXT: v_cndmask_b32_e32 v3, v18, v19, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, s7, v16 -; GFX10-NEXT: v_cndmask_b32_e64 v4, v14, v4, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v3, s0 -; GFX10-NEXT: v_cndmask_b32_e32 v2, v22, v2, vcc_lo -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s7, v12 -; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc_lo -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s6, v6 +; GFX10-NEXT: v_cmp_eq_u32_e64 s1, s5, v16 +; GFX10-NEXT: v_cndmask_b32_e64 v2, v19, v2, s1 +; GFX10-NEXT: v_cmp_eq_u32_e64 s1, s5, v3 +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 +; GFX10-NEXT: v_cndmask_b32_e64 v5, v20, v5, s1 +; GFX10-NEXT: v_sub_co_u32 v2, s1, v15, s4 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc_lo +; GFX10-NEXT: v_cmp_ne_u32_e64 s2, 0, v5 +; GFX10-NEXT: v_subrev_co_ci_u32_e64 v6, s1, 0, v0, s1 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v11, v13, vcc_lo +; GFX10-NEXT: v_cmp_le_u32_e64 s1, s7, v18 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v7, v1, s2 +; GFX10-NEXT: v_subrev_co_ci_u32_e64 v7, s0, s7, v4, s0 +; GFX10-NEXT: v_cmp_le_u32_e64 s0, s6, v17 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v9, v5, s2 +; GFX10-NEXT: 
v_cndmask_b32_e32 v2, v15, v2, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, -1, s1 +; GFX10-NEXT: v_cndmask_b32_e32 v6, v16, v6, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, -1, s0 +; GFX10-NEXT: v_sub_co_u32 v11, s0, v17, s6 +; GFX10-NEXT: v_subrev_co_ci_u32_e64 v13, s1, 0, v7, s0 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, s7, v18 +; GFX10-NEXT: v_cndmask_b32_e64 v4, v14, v2, s2 +; GFX10-NEXT: v_cndmask_b32_e32 v2, v5, v9, vcc_lo +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s7, v13 +; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc_lo +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s6, v11 ; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, -1, vcc_lo -; GFX10-NEXT: v_add_co_u32 v13, vcc_lo, v10, 1 -; GFX10-NEXT: v_add_co_ci_u32_e32 v14, vcc_lo, 0, v7, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, s7, v12 -; GFX10-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc_lo -; GFX10-NEXT: v_add_co_u32 v9, vcc_lo, v13, 1 -; GFX10-NEXT: v_add_co_ci_u32_e32 v17, vcc_lo, 0, v14, vcc_lo -; GFX10-NEXT: v_subrev_co_ci_u32_e64 v18, vcc_lo, s7, v23, s1 -; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8 -; GFX10-NEXT: v_sub_co_u32 v8, s1, v6, s6 -; GFX10-NEXT: v_subrev_co_ci_u32_e64 v18, s1, 0, v18, s1 -; GFX10-NEXT: v_cndmask_b32_e32 v9, v13, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v13, v14, v17, vcc_lo -; GFX10-NEXT: v_cmp_ne_u32_e64 s1, 0, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v8, v12, v18, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v10, v9, s1 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v13, s1 -; GFX10-NEXT: v_cndmask_b32_e64 v6, v15, v6, s1 -; GFX10-NEXT: v_cndmask_b32_e64 v7, v16, v8, s1 -; GFX10-NEXT: global_store_dwordx4 v11, v[0:3], s[12:13] -; GFX10-NEXT: global_store_dwordx4 v11, v[4:7], s[14:15] +; GFX10-NEXT: v_add_co_u32 v14, vcc_lo, v8, 1 +; GFX10-NEXT: v_add_co_ci_u32_e32 v15, vcc_lo, 0, v10, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, s7, v13 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc_lo +; GFX10-NEXT: v_add_co_u32 v9, vcc_lo, v14, 1 +; GFX10-NEXT: v_add_co_ci_u32_e32 v16, vcc_lo, 0, v15, vcc_lo +; GFX10-NEXT: v_subrev_co_ci_u32_e64 v7, vcc_lo, s7, v7, s0 +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v5 +; GFX10-NEXT: v_sub_co_u32 v5, s0, v11, s6 +; GFX10-NEXT: v_subrev_co_ci_u32_e64 v7, s0, 0, v7, s0 +; GFX10-NEXT: v_cndmask_b32_e32 v9, v14, v9, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v14, v15, v16, vcc_lo +; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v11, v11, v5, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v7, v13, v7, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v5, v3, v6, s2 +; GFX10-NEXT: v_cndmask_b32_e64 v2, v8, v9, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v10, v14, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v6, v17, v11, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v7, v18, v7, s0 +; GFX10-NEXT: global_store_dwordx4 v12, v[0:3], s[12:13] +; GFX10-NEXT: global_store_dwordx4 v12, v[4:7], s[14:15] ; GFX10-NEXT: s_endpgm %div = udiv <2 x i64> %x, %y store <2 x i64> %div, ptr addrspace(1) %out0 diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll index 6a95881067b93..ff74d1f71616d 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll @@ -2147,12 +2147,12 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1164-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mad_u64_u32 v[0:1], null, s4, v2, s[2:3] +; GFX1164-NEXT: v_mad_u64_u32 v[3:4], null, s4, v2, s[2:3] ; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1164-NEXT: s_mov_b32 s2, -1 -; GFX1164-NEXT: v_mad_u64_u32 v[3:4], null, s5, v2, v[1:2] -; GFX1164-NEXT: v_mov_b32_e32 v1, v3 -; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 +; GFX1164-NEXT: v_mov_b32_e32 v0, v4 +; GFX1164-NEXT: v_mad_u64_u32 v[4:5], null, s5, v2, v[0:1] +; GFX1164-NEXT: buffer_store_b64 v[3:4], off, s[0:3], 0 ; GFX1164-NEXT: s_endpgm ; ; GFX1132-LABEL: add_i64_uniform: @@ -2190,12 +2190,12 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1132-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_mad_u64_u32 v[0:1], null, s4, v2, s[2:3] +; GFX1132-NEXT: v_mad_u64_u32 v[3:4], null, s4, v2, s[2:3] ; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1132-NEXT: s_mov_b32 s2, -1 -; GFX1132-NEXT: v_mad_u64_u32 v[3:4], null, s5, v2, v[1:2] -; GFX1132-NEXT: v_mov_b32_e32 v1, v3 -; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 +; GFX1132-NEXT: v_mov_b32_e32 v0, v4 +; GFX1132-NEXT: v_mad_u64_u32 v[4:5], null, s5, v2, v[0:1] +; GFX1132-NEXT: buffer_store_b64 v[3:4], off, s[0:3], 0 ; GFX1132-NEXT: s_endpgm ; ; GFX1264-LABEL: add_i64_uniform: @@ -6208,10 +6208,9 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX9-NEXT: s_or_b64 exec, exec, s[10:11] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], s8, v4, 0 -; GFX9-NEXT: v_readfirstlane_b32 s7, v0 ; GFX9-NEXT: v_readfirstlane_b32 s6, v1 -; GFX9-NEXT: v_mov_b32_e32 v0, v3 -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s9, v4, v[0:1] +; GFX9-NEXT: v_readfirstlane_b32 s7, v0 +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s9, v4, v[3:4] ; GFX9-NEXT: v_mov_b32_e32 v3, s6 ; GFX9-NEXT: v_sub_co_u32_e32 v1, vcc, s7, v2 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll index 08a4f0cdad18f..f5ca24f59a286 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll @@ -1889,13 +1889,13 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, i64 %additive) ; GFX1164-NEXT: v_readfirstlane_b32 s5, v1 ; GFX1164-NEXT: v_readfirstlane_b32 s4, v0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_mad_u64_u32 v[0:1], null, s2, v2, s[4:5] +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_mad_u64_u32 v[3:4], null, s2, v2, s[4:5] ; GFX1164-NEXT: s_mov_b32 s2, -1 -; GFX1164-NEXT: v_mad_u64_u32 v[3:4], null, s3, v2, v[1:2] +; GFX1164-NEXT: v_mov_b32_e32 v0, v4 +; GFX1164-NEXT: v_mad_u64_u32 v[4:5], null, s3, v2, v[0:1] ; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1164-NEXT: v_mov_b32_e32 v1, v3 -; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 +; GFX1164-NEXT: buffer_store_b64 v[3:4], off, s[0:3], 0 ; GFX1164-NEXT: s_endpgm ; ; GFX1132-LABEL: add_i64_uniform: @@ -1926,13 +1926,13 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, i64 %additive) ; GFX1132-NEXT: v_readfirstlane_b32 s5, v1 ; GFX1132-NEXT: v_readfirstlane_b32 s4, v0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_mad_u64_u32 v[0:1], null, s2, v2, s[4:5] +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX1132-NEXT: v_mad_u64_u32 v[3:4], null, s2, v2, s[4:5] ; GFX1132-NEXT: s_mov_b32 s2, -1 -; GFX1132-NEXT: v_mad_u64_u32 v[3:4], null, s3, v2, v[1:2] +; GFX1132-NEXT: v_mov_b32_e32 v0, v4 +; GFX1132-NEXT: v_mad_u64_u32 v[4:5], null, s3, v2, v[0:1] ; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1132-NEXT: v_mov_b32_e32 v1, v3 -; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 +; GFX1132-NEXT: buffer_store_b64 v[3:4], off, s[0:3], 0 ; GFX1132-NEXT: s_endpgm entry: %old = atomicrmw add ptr addrspace(3) @local_var64, i64 %additive acq_rel @@ -5182,13 +5182,12 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, i64 %subitive) ; GFX9-NEXT: .LBB13_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[8:9], s2, v2, 0 -; GFX9-NEXT: v_readfirstlane_b32 s8, v0 ; GFX9-NEXT: s_mov_b32 s4, s0 -; GFX9-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-NEXT: s_mov_b32 s5, s1 +; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s2, v2, 0 ; GFX9-NEXT: v_readfirstlane_b32 s2, v1 -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s3, v2, v[0:1] +; GFX9-NEXT: v_readfirstlane_b32 s8, v0 +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s3, v2, v[4:5] ; GFX9-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-NEXT: v_sub_co_u32_e32 v1, vcc, s8, v3 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-contents-legalization.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-contents-legalization.ll index 142290a39f8f4..361bc78759bfa 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-contents-legalization.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-contents-legalization.ll @@ -2382,17 +2382,17 @@ define <12 x i8> @load_v12i8(ptr addrspace(8) inreg %buf) { ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SDAG-NEXT: buffer_load_dwordx3 v[0:2], off, s[16:19], 0 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v8, v2 -; SDAG-NEXT: v_lshrrev_b32_e32 v9, 8, v2 ; SDAG-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1] ; SDAG-NEXT: v_lshrrev_b32_e32 v14, 8, v0 ; SDAG-NEXT: v_lshrrev_b32_e32 v13, 16, v0 -; SDAG-NEXT: v_lshrrev_b64 v[11:12], 24, v[8:9] +; SDAG-NEXT: v_lshrrev_b64 v[11:12], 24, v[2:3] ; SDAG-NEXT: v_lshrrev_b32_e32 v5, 8, v1 ; SDAG-NEXT: v_lshrrev_b32_e32 v6, 16, v1 ; SDAG-NEXT: v_lshrrev_b32_e32 v7, 24, v1 +; SDAG-NEXT: v_lshrrev_b32_e32 v9, 8, v2 ; SDAG-NEXT: v_lshrrev_b32_e32 v10, 16, v2 ; SDAG-NEXT: v_mov_b32_e32 v4, v1 +; SDAG-NEXT: v_mov_b32_e32 v8, v2 ; SDAG-NEXT: v_mov_b32_e32 v1, v14 ; SDAG-NEXT: v_mov_b32_e32 v2, v13 ; SDAG-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/bypass-div.ll b/llvm/test/CodeGen/AMDGPU/bypass-div.ll index d7d697ef85b9f..00baf0a44368d 100644 --- a/llvm/test/CodeGen/AMDGPU/bypass-div.ll +++ b/llvm/test/CodeGen/AMDGPU/bypass-div.ll @@ -1026,102 +1026,100 @@ define i64 @sdiv64_known32(i64 %a, i64 %b) { ; GFX9-NEXT: v_or_b32_e32 v5, v2, v0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] -; GFX9-NEXT: v_mov_b32_e32 v7, v1 -; GFX9-NEXT: v_mov_b32_e32 v6, v3 ; GFX9-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[6:7], exec, s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB10_2 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: v_cvt_f32_u32_e32 v1, v6 -; GFX9-NEXT: v_cvt_f32_u32_e32 v3, v0 -; GFX9-NEXT: v_sub_co_u32_e32 
v11, vcc, 0, v6 -; GFX9-NEXT: v_subb_co_u32_e32 v12, vcc, 0, v0, vcc -; GFX9-NEXT: v_madmk_f32 v1, v3, 0x4f800000, v1 -; GFX9-NEXT: v_rcp_f32_e32 v1, v1 -; GFX9-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v1 -; GFX9-NEXT: v_mul_f32_e32 v3, 0x2f800000, v1 -; GFX9-NEXT: v_trunc_f32_e32 v3, v3 -; GFX9-NEXT: v_madmk_f32 v1, v3, 0xcf800000, v1 -; GFX9-NEXT: v_cvt_u32_f32_e32 v10, v3 -; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX9-NEXT: v_mul_lo_u32 v5, v11, v10 -; GFX9-NEXT: v_mul_lo_u32 v8, v12, v1 -; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v11, v1, 0 -; GFX9-NEXT: v_add3_u32 v8, v4, v5, v8 -; GFX9-NEXT: v_mul_hi_u32 v9, v1, v3 -; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, v8, 0 -; GFX9-NEXT: v_add_co_u32_e32 v13, vcc, v9, v4 -; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v10, v3, 0 -; GFX9-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v10, v8, 0 -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v13, v3 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v4, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v9, vcc -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v8 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc -; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v3 -; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, v10, v4, vcc -; GFX9-NEXT: v_mul_lo_u32 v5, v11, v13 -; GFX9-NEXT: v_mul_lo_u32 v8, v12, v1 -; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v11, v1, 0 -; GFX9-NEXT: v_add3_u32 v8, v4, v5, v8 -; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v13, v8, 0 -; GFX9-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v1, v8, 0 -; GFX9-NEXT: v_mul_hi_u32 v12, v1, v3 -; GFX9-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v13, v3, 0 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v12, v8 -; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, 0, v9, vcc -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v10 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v8, v11, vcc +; GFX9-NEXT: v_cvt_f32_u32_e32 v4, v3 +; GFX9-NEXT: v_cvt_f32_u32_e32 v5, v0 +; GFX9-NEXT: v_sub_co_u32_e32 v10, vcc, 0, v3 +; GFX9-NEXT: v_subb_co_u32_e32 v11, vcc, 0, v0, vcc +; GFX9-NEXT: v_madmk_f32 v4, v5, 0x4f800000, v4 +; GFX9-NEXT: v_rcp_f32_e32 v4, v4 +; GFX9-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 +; GFX9-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4 +; GFX9-NEXT: v_trunc_f32_e32 v5, v5 +; GFX9-NEXT: v_madmk_f32 v4, v5, 0xcf800000, v4 +; GFX9-NEXT: v_cvt_u32_f32_e32 v8, v5 +; GFX9-NEXT: v_cvt_u32_f32_e32 v9, v4 +; GFX9-NEXT: v_mul_lo_u32 v6, v10, v8 +; GFX9-NEXT: v_mul_lo_u32 v7, v11, v9 +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v9, 0 +; GFX9-NEXT: v_add3_u32 v7, v5, v6, v7 +; GFX9-NEXT: v_mul_hi_u32 v12, v9, v4 +; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v9, v7, 0 +; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, v12, v5 +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v8, v4, 0 +; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v6, vcc +; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v8, v7, 0 +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v12, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v13, v5, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v7, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v6 ; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v5, vcc -; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v3 -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v13, v4, vcc -; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v7, v5, 0 -; GFX9-NEXT: v_mul_hi_u32 v8, v7, v1 -; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, v8, v3 -; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v4, vcc -; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v2, v1, 0 -; GFX9-NEXT: v_mad_u64_u32 v[8:9], 
s[4:5], v2, v5, 0 -; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v10, v3 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v11, v4, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v9, vcc -; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v8 -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v3, vcc -; GFX9-NEXT: v_mul_lo_u32 v8, v0, v1 -; GFX9-NEXT: v_mul_lo_u32 v9, v6, v5 -; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v1, 0 -; GFX9-NEXT: v_add3_u32 v4, v4, v9, v8 -; GFX9-NEXT: v_sub_u32_e32 v8, v2, v4 -; GFX9-NEXT: v_sub_co_u32_e32 v3, vcc, v7, v3 -; GFX9-NEXT: v_subb_co_u32_e64 v7, s[4:5], v8, v0, vcc -; GFX9-NEXT: v_sub_co_u32_e64 v8, s[4:5], v3, v6 -; GFX9-NEXT: v_subbrev_co_u32_e64 v7, s[4:5], 0, v7, s[4:5] -; GFX9-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v0 +; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, v9, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, v8, v5, vcc +; GFX9-NEXT: v_mul_lo_u32 v6, v10, v13 +; GFX9-NEXT: v_mul_lo_u32 v7, v11, v12 +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v12, 0 +; GFX9-NEXT: v_add3_u32 v7, v5, v6, v7 +; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v13, v7, 0 +; GFX9-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v12, v7, 0 +; GFX9-NEXT: v_mul_hi_u32 v11, v12, v4 +; GFX9-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v4, 0 +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v11, v7 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v8, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v9 +; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v7, v10, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v6, vcc +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v12, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v13, v5, vcc +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, v7, 0 +; GFX9-NEXT: v_mul_hi_u32 v8, v1, v6 +; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v8, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v5, vcc +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v6, 0 +; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v2, v7, 0 +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v8, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v9, v5, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v7, vcc +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v4, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v5, vcc +; GFX9-NEXT: v_mul_lo_u32 v8, v0, v6 +; GFX9-NEXT: v_mul_lo_u32 v9, v3, v7 +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v3, v6, 0 +; GFX9-NEXT: v_add3_u32 v5, v5, v9, v8 +; GFX9-NEXT: v_sub_u32_e32 v8, v2, v5 +; GFX9-NEXT: v_sub_co_u32_e32 v1, vcc, v1, v4 +; GFX9-NEXT: v_subb_co_u32_e64 v4, s[4:5], v8, v0, vcc +; GFX9-NEXT: v_sub_co_u32_e64 v8, s[4:5], v1, v3 +; GFX9-NEXT: v_subbrev_co_u32_e64 v4, s[4:5], 0, v4, s[4:5] +; GFX9-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v0 ; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] -; GFX9-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v6 +; GFX9-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v3 ; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], v7, v0 -; GFX9-NEXT: v_cndmask_b32_e64 v7, v9, v8, s[4:5] -; GFX9-NEXT: v_add_co_u32_e64 v8, s[4:5], 2, v1 -; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v4, vcc -; GFX9-NEXT: v_addc_co_u32_e64 v9, s[4:5], 0, v5, s[4:5] +; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, v0 +; GFX9-NEXT: v_cndmask_b32_e64 v4, v9, v8, s[4:5] +; GFX9-NEXT: v_add_co_u32_e64 v8, s[4:5], 2, v6 +; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v5, vcc +; GFX9-NEXT: v_addc_co_u32_e64 v9, s[4:5], 0, v7, s[4:5] ; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v2, v0 -; GFX9-NEXT: v_add_co_u32_e64 v10, s[4:5], 1, v1 -; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc -; GFX9-NEXT: 
v_cmp_ge_u32_e32 vcc, v3, v6 -; GFX9-NEXT: v_addc_co_u32_e64 v11, s[4:5], 0, v5, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc +; GFX9-NEXT: v_add_co_u32_e64 v10, s[4:5], 1, v6 +; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc +; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3 +; GFX9-NEXT: v_addc_co_u32_e64 v11, s[4:5], 0, v7, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v2, v0 -; GFX9-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v7 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v7, v11, v9, s[4:5] +; GFX9-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v1, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v4, v11, v9, s[4:5] ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v10, v8, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v4, v1, v0, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v5, v7, v4, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v6, v0, vcc ; GFX9-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX9-NEXT: .LBB10_2: ; %Flow diff --git a/llvm/test/CodeGen/AMDGPU/dagcomb-mullohi.ll b/llvm/test/CodeGen/AMDGPU/dagcomb-mullohi.ll index 613fdf388c0f1..0f45e99dd76c4 100644 --- a/llvm/test/CodeGen/AMDGPU/dagcomb-mullohi.ll +++ b/llvm/test/CodeGen/AMDGPU/dagcomb-mullohi.ll @@ -64,13 +64,11 @@ define <2 x i32> @mullohi_2xu32(<2 x i32> %arg, <2 x i32> %arg1, ptr %arg2) { ; CHECK-LABEL: mullohi_2xu32: ; CHECK: ; %bb.0: ; %bb ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v6, v1 +; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v3, v1, 0 ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, v0, 0 -; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v3, v6, 0 -; CHECK-NEXT: v_mov_b32_e32 v6, v1 -; CHECK-NEXT: v_mov_b32_e32 v7, v3 -; CHECK-NEXT: v_mov_b32_e32 v1, v2 -; CHECK-NEXT: flat_store_dwordx2 v[4:5], v[6:7] +; CHECK-NEXT: v_mov_b32_e32 v2, v7 +; CHECK-NEXT: flat_store_dwordx2 v[4:5], v[1:2] +; CHECK-NEXT: v_mov_b32_e32 v1, v6 ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] bb: diff --git a/llvm/test/CodeGen/AMDGPU/div-rem-by-constant-64.ll b/llvm/test/CodeGen/AMDGPU/div-rem-by-constant-64.ll index 54cbc25043db3..e841ec43fd064 100644 --- a/llvm/test/CodeGen/AMDGPU/div-rem-by-constant-64.ll +++ b/llvm/test/CodeGen/AMDGPU/div-rem-by-constant-64.ll @@ -193,14 +193,13 @@ define noundef i64 @urem64_3(i64 noundef %i) { ; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v0, s6, v[2:3] ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v5, v3 ; GFX9-NEXT: v_addc_co_u32_e64 v3, s[4:5], 0, 0, vcc -; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v1, s6, v[2:3] -; GFX9-NEXT: v_alignbit_b32 v2, v3, v2, 1 -; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, 3, 0 -; GFX9-NEXT: v_lshrrev_b32_e32 v3, 1, v3 -; GFX9-NEXT: v_mov_b32_e32 v2, v5 -; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v3, 3, v[2:3] -; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v4 -; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v2, vcc +; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v1, s6, v[2:3] +; GFX9-NEXT: v_alignbit_b32 v2, v4, v3, 1 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v2, 3, 0 +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 1, v4 +; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v4, 3, v[3:4] +; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v2 +; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v3, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: urem64_3: @@ -238,14 +237,13 @@ define noundef i64 @urem64_3(i64 noundef %i) { ; GFX1030-NEXT: v_mad_u64_u32 v[2:3], null, 0xaaaaaaaa, 
v0, v[2:3] ; GFX1030-NEXT: v_add_co_u32 v2, s4, v5, v3 ; GFX1030-NEXT: v_add_co_ci_u32_e64 v3, null, 0, 0, s4 -; GFX1030-NEXT: v_mad_u64_u32 v[2:3], null, 0xaaaaaaaa, v1, v[2:3] -; GFX1030-NEXT: v_alignbit_b32 v2, v3, v2, 1 -; GFX1030-NEXT: v_lshrrev_b32_e32 v3, 1, v3 -; GFX1030-NEXT: v_mad_u64_u32 v[4:5], null, v2, 3, 0 -; GFX1030-NEXT: v_mov_b32_e32 v2, v5 -; GFX1030-NEXT: v_sub_co_u32 v0, vcc_lo, v0, v4 -; GFX1030-NEXT: v_mad_u64_u32 v[2:3], null, v3, 3, v[2:3] -; GFX1030-NEXT: v_sub_co_ci_u32_e64 v1, null, v1, v2, vcc_lo +; GFX1030-NEXT: v_mad_u64_u32 v[3:4], null, 0xaaaaaaaa, v1, v[2:3] +; GFX1030-NEXT: v_alignbit_b32 v2, v4, v3, 1 +; GFX1030-NEXT: v_lshrrev_b32_e32 v4, 1, v4 +; GFX1030-NEXT: v_mad_u64_u32 v[2:3], null, v2, 3, 0 +; GFX1030-NEXT: v_mad_u64_u32 v[3:4], null, v4, 3, v[3:4] +; GFX1030-NEXT: v_sub_co_u32 v0, vcc_lo, v0, v2 +; GFX1030-NEXT: v_sub_co_ci_u32_e64 v1, null, v1, v3, vcc_lo ; GFX1030-NEXT: s_setpc_b64 s[30:31] entry: %rem = urem i64 %i, 3 @@ -265,14 +263,13 @@ define noundef i64 @urem64_6(i64 noundef %i) { ; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v0, s6, v[2:3] ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v5, v3 ; GFX9-NEXT: v_addc_co_u32_e64 v3, s[4:5], 0, 0, vcc -; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v1, s6, v[2:3] -; GFX9-NEXT: v_alignbit_b32 v2, v3, v2, 2 -; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, 6, 0 -; GFX9-NEXT: v_lshrrev_b32_e32 v3, 2, v3 -; GFX9-NEXT: v_mov_b32_e32 v2, v5 -; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v3, 6, v[2:3] -; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v4 -; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v2, vcc +; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v1, s6, v[2:3] +; GFX9-NEXT: v_alignbit_b32 v2, v4, v3, 2 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v2, 6, 0 +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 2, v4 +; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v4, 6, v[3:4] +; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v2 +; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v3, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: urem64_6: @@ -310,14 +307,13 @@ define noundef i64 @urem64_6(i64 noundef %i) { ; GFX1030-NEXT: v_mad_u64_u32 v[2:3], null, 0xaaaaaaaa, v0, v[2:3] ; GFX1030-NEXT: v_add_co_u32 v2, s4, v5, v3 ; GFX1030-NEXT: v_add_co_ci_u32_e64 v3, null, 0, 0, s4 -; GFX1030-NEXT: v_mad_u64_u32 v[2:3], null, 0xaaaaaaaa, v1, v[2:3] -; GFX1030-NEXT: v_alignbit_b32 v2, v3, v2, 2 -; GFX1030-NEXT: v_lshrrev_b32_e32 v3, 2, v3 -; GFX1030-NEXT: v_mad_u64_u32 v[4:5], null, v2, 6, 0 -; GFX1030-NEXT: v_mov_b32_e32 v2, v5 -; GFX1030-NEXT: v_sub_co_u32 v0, vcc_lo, v0, v4 -; GFX1030-NEXT: v_mad_u64_u32 v[2:3], null, v3, 6, v[2:3] -; GFX1030-NEXT: v_sub_co_ci_u32_e64 v1, null, v1, v2, vcc_lo +; GFX1030-NEXT: v_mad_u64_u32 v[3:4], null, 0xaaaaaaaa, v1, v[2:3] +; GFX1030-NEXT: v_alignbit_b32 v2, v4, v3, 2 +; GFX1030-NEXT: v_lshrrev_b32_e32 v4, 2, v4 +; GFX1030-NEXT: v_mad_u64_u32 v[2:3], null, v2, 6, 0 +; GFX1030-NEXT: v_mad_u64_u32 v[3:4], null, v4, 6, v[3:4] +; GFX1030-NEXT: v_sub_co_u32 v0, vcc_lo, v0, v2 +; GFX1030-NEXT: v_sub_co_ci_u32_e64 v1, null, v1, v3, vcc_lo ; GFX1030-NEXT: s_setpc_b64 s[30:31] entry: %rem = urem i64 %i, 6 diff --git a/llvm/test/CodeGen/AMDGPU/div_v2i128.ll b/llvm/test/CodeGen/AMDGPU/div_v2i128.ll index 26f77898faf60..ddac86b3719c2 100644 --- a/llvm/test/CodeGen/AMDGPU/div_v2i128.ll +++ b/llvm/test/CodeGen/AMDGPU/div_v2i128.ll @@ -1953,68 +1953,66 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: s_or_b64 exec, exec, s[6:7] ; SDAG-NEXT: v_mul_lo_u32 v14, v33, v3 ; SDAG-NEXT: v_mad_u64_u32 
v[10:11], s[4:5], v33, v2, 0 -; SDAG-NEXT: v_mul_lo_u32 v24, v27, v2 -; SDAG-NEXT: v_mul_lo_u32 v25, v34, v31 -; SDAG-NEXT: v_mul_lo_u32 v34, v32, v30 -; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v31, v33, 0 -; SDAG-NEXT: v_mov_b32_e32 v15, 0 -; SDAG-NEXT: v_mul_lo_u32 v38, v12, v7 -; SDAG-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v12, v6, 0 -; SDAG-NEXT: v_mul_lo_u32 v39, v13, v6 +; SDAG-NEXT: v_mul_lo_u32 v15, v27, v2 +; SDAG-NEXT: v_mul_lo_u32 v23, v34, v31 +; SDAG-NEXT: v_mul_lo_u32 v24, v32, v30 +; SDAG-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v31, v33, 0 +; SDAG-NEXT: v_mov_b32_e32 v22, 0 +; SDAG-NEXT: v_mul_lo_u32 v25, v12, v7 +; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v12, v6, 0 +; SDAG-NEXT: v_mul_lo_u32 v34, v13, v6 ; SDAG-NEXT: v_mul_lo_u32 v19, v19, v37 -; SDAG-NEXT: v_mul_lo_u32 v48, v18, v36 -; SDAG-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v37, v12, 0 +; SDAG-NEXT: v_mul_lo_u32 v38, v18, v36 ; SDAG-NEXT: v_add_i32_e32 v11, vcc, v11, v14 -; SDAG-NEXT: v_mov_b32_e32 v14, v3 -; SDAG-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v30, v33, v[14:15] -; SDAG-NEXT: v_sub_i32_e32 v16, vcc, v16, v2 -; SDAG-NEXT: v_add_i32_e64 v21, s[4:5], v21, v38 -; SDAG-NEXT: v_add_i32_e64 v11, s[4:5], v11, v24 -; SDAG-NEXT: v_mov_b32_e32 v14, v22 -; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v31, v27, v[14:15] -; SDAG-NEXT: v_xor_b32_e32 v24, v16, v28 -; SDAG-NEXT: v_add_i32_e64 v21, s[4:5], v21, v39 +; SDAG-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v30, v33, v[21:22] +; SDAG-NEXT: v_sub_i32_e32 v16, vcc, v16, v20 +; SDAG-NEXT: v_add_i32_e64 v3, s[4:5], v3, v25 +; SDAG-NEXT: v_add_i32_e64 v11, s[4:5], v11, v15 +; SDAG-NEXT: v_mov_b32_e32 v21, v6 +; SDAG-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v31, v27, v[21:22] +; SDAG-NEXT: v_xor_b32_e32 v16, v16, v28 +; SDAG-NEXT: v_add_i32_e64 v3, s[4:5], v3, v34 ; SDAG-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v32, v31, v[10:11] -; SDAG-NEXT: v_add_i32_e64 v22, s[4:5], v23, v3 -; SDAG-NEXT: v_addc_u32_e64 v23, s[4:5], 0, 0, s[4:5] -; SDAG-NEXT: v_subb_u32_e32 v31, vcc, v17, v2, vcc -; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v18, v37, v[20:21] -; SDAG-NEXT: v_mov_b32_e32 v14, v7 -; SDAG-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v36, v12, v[14:15] -; SDAG-NEXT: v_add_i32_e64 v7, s[4:5], v25, v11 -; SDAG-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v30, v27, v[22:23] -; SDAG-NEXT: v_xor_b32_e32 v18, v31, v29 +; SDAG-NEXT: v_add_i32_e64 v6, s[4:5], v7, v15 +; SDAG-NEXT: v_addc_u32_e64 v7, s[4:5], 0, 0, s[4:5] +; SDAG-NEXT: v_subb_u32_e32 v14, vcc, v17, v14, vcc +; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v18, v37, v[2:3] +; SDAG-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v37, v12, 0 +; SDAG-NEXT: v_add_i32_e64 v15, s[4:5], v23, v11 +; SDAG-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v30, v27, v[6:7] +; SDAG-NEXT: v_xor_b32_e32 v17, v14, v29 ; SDAG-NEXT: v_add_i32_e64 v3, s[4:5], v19, v3 -; SDAG-NEXT: v_mov_b32_e32 v14, v16 -; SDAG-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v37, v13, v[14:15] -; SDAG-NEXT: v_add_i32_e64 v7, s[4:5], v34, v7 -; SDAG-NEXT: v_add_i32_e64 v3, s[4:5], v48, v3 -; SDAG-NEXT: v_add_i32_e64 v15, s[4:5], v17, v15 -; SDAG-NEXT: v_addc_u32_e64 v16, s[4:5], 0, 0, s[4:5] -; SDAG-NEXT: v_add_i32_e64 v10, s[4:5], v11, v10 -; SDAG-NEXT: v_addc_u32_e64 v7, s[4:5], v12, v7, s[4:5] -; SDAG-NEXT: v_subb_u32_e32 v0, vcc, v0, v10, vcc -; SDAG-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v36, v13, v[15:16] -; SDAG-NEXT: v_subb_u32_e32 v1, vcc, v1, v7, vcc -; SDAG-NEXT: v_xor_b32_e32 v7, v0, v28 -; SDAG-NEXT: v_add_i32_e32 v10, vcc, v10, v2 -; SDAG-NEXT: v_addc_u32_e32 v11, vcc, v11, v3, vcc -; SDAG-NEXT: v_xor_b32_e32 v3, v1, 
v29 -; SDAG-NEXT: v_sub_i32_e32 v0, vcc, v24, v28 -; SDAG-NEXT: v_subb_u32_e32 v1, vcc, v18, v29, vcc -; SDAG-NEXT: v_subb_u32_e32 v2, vcc, v7, v28, vcc -; SDAG-NEXT: v_subb_u32_e32 v3, vcc, v3, v29, vcc -; SDAG-NEXT: v_sub_i32_e32 v6, vcc, v8, v6 -; SDAG-NEXT: v_subb_u32_e32 v7, vcc, v9, v14, vcc -; SDAG-NEXT: v_xor_b32_e32 v6, v6, v26 -; SDAG-NEXT: v_subb_u32_e32 v4, vcc, v4, v10, vcc -; SDAG-NEXT: v_xor_b32_e32 v7, v7, v35 -; SDAG-NEXT: v_subb_u32_e32 v5, vcc, v5, v11, vcc -; SDAG-NEXT: v_xor_b32_e32 v8, v4, v26 +; SDAG-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v36, v12, v[21:22] +; SDAG-NEXT: v_add_i32_e64 v18, s[4:5], v24, v15 +; SDAG-NEXT: v_add_i32_e64 v3, s[4:5], v38, v3 +; SDAG-NEXT: v_mov_b32_e32 v21, v11 +; SDAG-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v37, v13, v[21:22] +; SDAG-NEXT: v_add_i32_e64 v6, s[4:5], v6, v10 +; SDAG-NEXT: v_addc_u32_e64 v10, s[4:5], v7, v18, s[4:5] +; SDAG-NEXT: v_subb_u32_e32 v0, vcc, v0, v6, vcc +; SDAG-NEXT: v_add_i32_e64 v6, s[4:5], v12, v15 +; SDAG-NEXT: v_addc_u32_e64 v7, s[4:5], 0, 0, s[4:5] +; SDAG-NEXT: v_subb_u32_e32 v1, vcc, v1, v10, vcc +; SDAG-NEXT: v_xor_b32_e32 v10, v0, v28 +; SDAG-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v36, v13, v[6:7] +; SDAG-NEXT: v_xor_b32_e32 v11, v1, v29 +; SDAG-NEXT: v_sub_i32_e32 v0, vcc, v16, v28 +; SDAG-NEXT: v_add_i32_e64 v6, s[4:5], v6, v2 +; SDAG-NEXT: v_addc_u32_e64 v7, s[4:5], v7, v3, s[4:5] +; SDAG-NEXT: v_subb_u32_e32 v1, vcc, v17, v29, vcc +; SDAG-NEXT: v_subb_u32_e32 v2, vcc, v10, v28, vcc +; SDAG-NEXT: v_subb_u32_e32 v3, vcc, v11, v29, vcc +; SDAG-NEXT: v_sub_i32_e32 v8, vcc, v8, v20 +; SDAG-NEXT: v_subb_u32_e32 v9, vcc, v9, v14, vcc +; SDAG-NEXT: v_xor_b32_e32 v8, v8, v26 +; SDAG-NEXT: v_subb_u32_e32 v4, vcc, v4, v6, vcc +; SDAG-NEXT: v_xor_b32_e32 v6, v9, v35 +; SDAG-NEXT: v_subb_u32_e32 v5, vcc, v5, v7, vcc +; SDAG-NEXT: v_xor_b32_e32 v7, v4, v26 ; SDAG-NEXT: v_xor_b32_e32 v9, v5, v35 -; SDAG-NEXT: v_sub_i32_e32 v4, vcc, v6, v26 -; SDAG-NEXT: v_subb_u32_e32 v5, vcc, v7, v35, vcc -; SDAG-NEXT: v_subb_u32_e32 v6, vcc, v8, v26, vcc +; SDAG-NEXT: v_sub_i32_e32 v4, vcc, v8, v26 +; SDAG-NEXT: v_subb_u32_e32 v5, vcc, v6, v35, vcc +; SDAG-NEXT: v_subb_u32_e32 v6, vcc, v7, v26, vcc ; SDAG-NEXT: v_subb_u32_e32 v7, vcc, v9, v35, vcc ; SDAG-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload ; SDAG-NEXT: s_waitcnt vmcnt(0) @@ -2407,44 +2405,41 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: s_or_b64 exec, exec, s[12:13] ; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v30, v31, 0 ; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v30, v18, 0 -; GISEL-NEXT: v_mul_lo_u32 v27, v30, v19 -; GISEL-NEXT: v_mul_lo_u32 v36, v29, v18 -; GISEL-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v35, v20, 0 -; GISEL-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v35, v2, 0 -; GISEL-NEXT: v_mul_lo_u32 v37, v35, v3 -; GISEL-NEXT: v_mul_lo_u32 v38, v34, v2 -; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v29, v32, v[14:15] -; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v34, v21, v[22:23] -; GISEL-NEXT: v_mov_b32_e32 v22, v19 -; GISEL-NEXT: v_mad_u64_u32 v[23:24], s[4:5], v10, v31, v[2:3] -; GISEL-NEXT: v_mad_u64_u32 v[25:26], s[4:5], v4, v20, v[14:15] -; GISEL-NEXT: v_mov_b32_e32 v2, v23 +; GISEL-NEXT: v_mul_lo_u32 v26, v30, v19 +; GISEL-NEXT: v_mul_lo_u32 v27, v29, v18 +; GISEL-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v35, v20, 0 +; GISEL-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v35, v2, 0 +; GISEL-NEXT: v_mul_lo_u32 v36, v35, v3 +; GISEL-NEXT: v_mul_lo_u32 v37, v34, v2 +; GISEL-NEXT: v_mad_u64_u32 v[24:25], s[4:5], 
v29, v32, v[14:15] +; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v34, v21, v[18:19] +; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v10, v31, v[24:25] +; GISEL-NEXT: v_mad_u64_u32 v[24:25], s[4:5], v4, v20, v[14:15] ; GISEL-NEXT: v_mad_u64_u32 v[14:15], vcc, v30, v32, v[1:2] -; GISEL-NEXT: v_mov_b32_e32 v23, v25 -; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v35, v21, v[22:23] -; GISEL-NEXT: v_mad_u64_u32 v[22:23], s[6:7], v29, v31, v[14:15] -; GISEL-NEXT: v_addc_u32_e64 v3, s[6:7], v24, v27, s[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v35, v21, v[23:24] +; GISEL-NEXT: v_mad_u64_u32 v[18:19], s[6:7], v29, v31, v[14:15] +; GISEL-NEXT: v_addc_u32_e64 v3, s[6:7], v3, v26, s[6:7] ; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[6:7], v34, v20, v[1:2] -; GISEL-NEXT: v_addc_u32_e64 v2, s[6:7], v26, v37, s[6:7] -; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v3, v36, vcc +; GISEL-NEXT: v_addc_u32_e64 v2, s[6:7], v25, v36, s[6:7] +; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v3, v27, vcc ; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v16, v0 -; GISEL-NEXT: v_subb_u32_e32 v16, vcc, v17, v22, vcc -; GISEL-NEXT: v_xor_b32_e32 v19, v0, v28 -; GISEL-NEXT: v_addc_u32_e64 v0, s[4:5], v2, v38, s[4:5] -; GISEL-NEXT: v_sub_i32_e64 v2, s[4:5], v12, v18 +; GISEL-NEXT: v_subb_u32_e32 v16, vcc, v17, v18, vcc +; GISEL-NEXT: v_xor_b32_e32 v18, v0, v28 +; GISEL-NEXT: v_addc_u32_e64 v0, s[4:5], v2, v37, s[4:5] +; GISEL-NEXT: v_sub_i32_e64 v2, s[4:5], v12, v22 ; GISEL-NEXT: v_subb_u32_e64 v14, s[4:5], v13, v14, s[4:5] -; GISEL-NEXT: v_xor_b32_e32 v18, v2, v33 +; GISEL-NEXT: v_xor_b32_e32 v22, v2, v33 ; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[6:7], v10, v32, v[1:2] ; GISEL-NEXT: v_xor_b32_e32 v1, v16, v28 ; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[6:7], v4, v21, v[0:1] ; GISEL-NEXT: v_xor_b32_e32 v10, v14, v33 ; GISEL-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v11, v31, v[2:3] -; GISEL-NEXT: v_sub_i32_e64 v0, s[6:7], v19, v28 +; GISEL-NEXT: v_sub_i32_e64 v0, s[6:7], v18, v28 ; GISEL-NEXT: v_subb_u32_e64 v1, s[6:7], v1, v28, s[6:7] ; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[8:9], v5, v20, v[12:13] -; GISEL-NEXT: v_sub_i32_e64 v4, s[8:9], v18, v33 +; GISEL-NEXT: v_sub_i32_e64 v4, s[8:9], v22, v33 ; GISEL-NEXT: v_subb_u32_e64 v5, s[8:9], v10, v33, s[8:9] -; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v8, v23, vcc +; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v8, v19, vcc ; GISEL-NEXT: v_subb_u32_e32 v8, vcc, v9, v16, vcc ; GISEL-NEXT: v_xor_b32_e32 v3, v3, v28 ; GISEL-NEXT: v_subb_u32_e64 v6, vcc, v6, v15, s[4:5] @@ -2815,52 +2810,50 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: v_or_b32_e32 v16, v16, v20 ; SDAG-NEXT: .LBB3_12: ; %Flow12 ; SDAG-NEXT: s_or_b64 exec, exec, s[6:7] -; SDAG-NEXT: v_mul_lo_u32 v20, v32, v11 -; SDAG-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v32, v10, 0 -; SDAG-NEXT: v_mul_lo_u32 v28, v30, v10 -; SDAG-NEXT: v_mul_lo_u32 v29, v33, v8 -; SDAG-NEXT: v_mul_lo_u32 v33, v31, v9 -; SDAG-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v8, v32, 0 -; SDAG-NEXT: v_mov_b32_e32 v21, 0 -; SDAG-NEXT: v_mul_lo_u32 v34, v16, v15 -; SDAG-NEXT: v_mad_u64_u32 v[24:25], s[4:5], v16, v14, 0 -; SDAG-NEXT: v_mul_lo_u32 v35, v17, v14 -; SDAG-NEXT: v_mul_lo_u32 v23, v23, v12 -; SDAG-NEXT: v_mul_lo_u32 v36, v22, v13 -; SDAG-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v12, v16, 0 -; SDAG-NEXT: v_add_i32_e32 v19, vcc, v19, v20 -; SDAG-NEXT: v_mov_b32_e32 v20, v11 -; SDAG-NEXT: v_mad_u64_u32 v[26:27], s[4:5], v9, v32, v[20:21] -; SDAG-NEXT: v_sub_i32_e32 v0, vcc, v0, v10 -; SDAG-NEXT: v_add_i32_e64 v25, s[4:5], v25, v34 -; SDAG-NEXT: v_add_i32_e64 v19, s[4:5], 
v19, v28 -; SDAG-NEXT: v_mov_b32_e32 v20, v26 -; SDAG-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v8, v30, v[20:21] -; SDAG-NEXT: v_add_i32_e64 v25, s[4:5], v25, v35 -; SDAG-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v31, v8, v[18:19] -; SDAG-NEXT: v_add_i32_e64 v26, s[4:5], v27, v11 -; SDAG-NEXT: v_addc_u32_e64 v27, s[4:5], 0, 0, s[4:5] -; SDAG-NEXT: v_subb_u32_e32 v1, vcc, v1, v10, vcc -; SDAG-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v22, v12, v[24:25] -; SDAG-NEXT: v_mov_b32_e32 v20, v15 -; SDAG-NEXT: v_mad_u64_u32 v[15:16], s[4:5], v13, v16, v[20:21] -; SDAG-NEXT: v_add_i32_e64 v19, s[4:5], v29, v19 -; SDAG-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v9, v30, v[26:27] -; SDAG-NEXT: v_add_i32_e64 v22, s[4:5], v23, v11 -; SDAG-NEXT: v_mov_b32_e32 v20, v15 -; SDAG-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v12, v17, v[20:21] -; SDAG-NEXT: v_add_i32_e64 v19, s[4:5], v33, v19 -; SDAG-NEXT: v_add_i32_e64 v20, s[4:5], v36, v22 -; SDAG-NEXT: v_add_i32_e64 v15, s[4:5], v16, v12 -; SDAG-NEXT: v_addc_u32_e64 v16, s[4:5], 0, 0, s[4:5] -; SDAG-NEXT: v_add_i32_e64 v8, s[4:5], v8, v18 -; SDAG-NEXT: v_addc_u32_e64 v12, s[4:5], v9, v19, s[4:5] +; SDAG-NEXT: v_mul_lo_u32 v21, v32, v11 +; SDAG-NEXT: v_mad_u64_u32 v[24:25], s[4:5], v32, v10, 0 +; SDAG-NEXT: v_mul_lo_u32 v26, v30, v10 +; SDAG-NEXT: v_mul_lo_u32 v27, v33, v8 +; SDAG-NEXT: v_mul_lo_u32 v28, v31, v9 +; SDAG-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v8, v32, 0 +; SDAG-NEXT: v_mov_b32_e32 v20, 0 +; SDAG-NEXT: v_mul_lo_u32 v29, v16, v15 +; SDAG-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v16, v14, 0 +; SDAG-NEXT: v_mul_lo_u32 v33, v17, v14 +; SDAG-NEXT: v_mul_lo_u32 v34, v23, v12 +; SDAG-NEXT: v_mul_lo_u32 v35, v22, v13 +; SDAG-NEXT: v_add_i32_e32 v21, vcc, v25, v21 +; SDAG-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v9, v32, v[19:20] +; SDAG-NEXT: v_sub_i32_e32 v0, vcc, v0, v18 +; SDAG-NEXT: v_add_i32_e64 v11, s[4:5], v11, v29 +; SDAG-NEXT: v_add_i32_e64 v25, s[4:5], v21, v26 +; SDAG-NEXT: v_mov_b32_e32 v19, v14 +; SDAG-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v8, v30, v[19:20] +; SDAG-NEXT: v_add_i32_e64 v11, s[4:5], v11, v33 +; SDAG-NEXT: v_mad_u64_u32 v[23:24], s[4:5], v31, v8, v[24:25] +; SDAG-NEXT: v_add_i32_e64 v14, s[4:5], v15, v19 +; SDAG-NEXT: v_addc_u32_e64 v15, s[4:5], 0, 0, s[4:5] +; SDAG-NEXT: v_subb_u32_e32 v1, vcc, v1, v18, vcc +; SDAG-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v22, v12, v[10:11] +; SDAG-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v12, v16, 0 +; SDAG-NEXT: v_add_i32_e64 v21, s[4:5], v27, v24 +; SDAG-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v9, v30, v[14:15] +; SDAG-NEXT: v_add_i32_e64 v11, s[4:5], v34, v11 +; SDAG-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v13, v16, v[19:20] +; SDAG-NEXT: v_add_i32_e64 v16, s[4:5], v28, v21 +; SDAG-NEXT: v_add_i32_e64 v21, s[4:5], v35, v11 +; SDAG-NEXT: v_mov_b32_e32 v19, v14 +; SDAG-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v12, v17, v[19:20] +; SDAG-NEXT: v_add_i32_e64 v8, s[4:5], v8, v23 +; SDAG-NEXT: v_addc_u32_e64 v14, s[4:5], v9, v16, s[4:5] ; SDAG-NEXT: v_subb_u32_e32 v2, vcc, v2, v8, vcc -; SDAG-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v13, v17, v[15:16] -; SDAG-NEXT: v_subb_u32_e32 v3, vcc, v3, v12, vcc +; SDAG-NEXT: v_add_i32_e64 v8, s[4:5], v15, v12 +; SDAG-NEXT: v_addc_u32_e64 v9, s[4:5], 0, 0, s[4:5] +; SDAG-NEXT: v_subb_u32_e32 v3, vcc, v3, v14, vcc +; SDAG-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v13, v17, v[8:9] ; SDAG-NEXT: v_add_i32_e32 v8, vcc, v8, v10 -; SDAG-NEXT: v_addc_u32_e32 v9, vcc, v9, v20, vcc -; SDAG-NEXT: v_sub_i32_e32 v4, vcc, v4, v14 +; SDAG-NEXT: v_addc_u32_e32 v9, vcc, v9, v21, vcc +; SDAG-NEXT: v_sub_i32_e32 v4, vcc, v4, v18 
; SDAG-NEXT: v_subb_u32_e32 v5, vcc, v5, v11, vcc ; SDAG-NEXT: v_subb_u32_e32 v6, vcc, v6, v8, vcc ; SDAG-NEXT: v_subb_u32_e32 v7, vcc, v7, v9, vcc @@ -3223,18 +3216,16 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: v_mad_u64_u32 v[26:27], s[4:5], v12, v18, 0 ; GISEL-NEXT: v_mul_lo_u32 v36, v12, v19 ; GISEL-NEXT: v_mul_lo_u32 v37, v13, v18 -; GISEL-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v9, v33, v[22:23] -; GISEL-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v13, v25, v[26:27] -; GISEL-NEXT: v_mad_u64_u32 v[26:27], s[4:5], v10, v32, v[18:19] -; GISEL-NEXT: v_mad_u64_u32 v[28:29], s[4:5], v14, v24, v[22:23] -; GISEL-NEXT: v_mov_b32_e32 v18, v26 -; GISEL-NEXT: v_mad_u64_u32 v[30:31], vcc, v8, v33, v[17:18] -; GISEL-NEXT: v_mov_b32_e32 v22, v28 +; GISEL-NEXT: v_mad_u64_u32 v[28:29], s[4:5], v9, v33, v[22:23] +; GISEL-NEXT: v_mad_u64_u32 v[30:31], s[4:5], v13, v25, v[26:27] +; GISEL-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v10, v32, v[28:29] +; GISEL-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v14, v24, v[30:31] +; GISEL-NEXT: v_mad_u64_u32 v[26:27], vcc, v8, v33, v[17:18] ; GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v12, v25, v[21:22] -; GISEL-NEXT: v_mad_u64_u32 v[21:22], s[6:7], v9, v32, v[30:31] -; GISEL-NEXT: v_addc_u32_e64 v12, s[6:7], v27, v34, s[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[21:22], s[6:7], v9, v32, v[26:27] +; GISEL-NEXT: v_addc_u32_e64 v12, s[6:7], v19, v34, s[6:7] ; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[6:7], v13, v24, v[17:18] -; GISEL-NEXT: v_addc_u32_e64 v13, s[6:7], v29, v36, s[6:7] +; GISEL-NEXT: v_addc_u32_e64 v13, s[6:7], v23, v36, s[6:7] ; GISEL-NEXT: v_addc_u32_e32 v12, vcc, v12, v35, vcc ; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v16 ; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v21, vcc diff --git a/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll b/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll index 460f1211d1386..0c4a15f6a9d5e 100644 --- a/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll +++ b/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll @@ -61,34 +61,32 @@ define i128 @fptosi_f64_to_i128(double %x) { ; SDAG-NEXT: v_cndmask_b32_e64 v3, 0, v1, s[6:7] ; SDAG-NEXT: v_cndmask_b32_e64 v2, v2, v0, s[4:5] ; SDAG-NEXT: v_lshlrev_b64 v[0:1], v7, v[4:5] -; SDAG-NEXT: v_mov_b32_e32 v5, 0 -; SDAG-NEXT: v_cndmask_b32_e64 v12, 0, v0, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v11, 0, v1, s[4:5] -; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v12, v10, 0 ; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, v2, s[6:7] -; SDAG-NEXT: v_mul_lo_u32 v13, v8, v2 -; SDAG-NEXT: v_mov_b32_e32 v4, v1 -; SDAG-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v11, v10, v[4:5] -; SDAG-NEXT: v_mul_lo_u32 v14, v10, v3 -; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v10, v2, 0 -; SDAG-NEXT: v_mov_b32_e32 v4, v6 -; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v12, v8, v[4:5] -; SDAG-NEXT: v_add3_u32 v3, v3, v14, v13 -; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v12, v[2:3] -; SDAG-NEXT: v_add_co_u32_e64 v5, s[4:5], v7, v5 +; SDAG-NEXT: v_cndmask_b32_e64 v11, 0, v0, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v7, 0, v1, s[4:5] +; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v11, v10, 0 +; SDAG-NEXT: v_mul_lo_u32 v12, v8, v2 +; SDAG-NEXT: v_mul_lo_u32 v13, v10, v3 +; SDAG-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v10, v2, 0 +; SDAG-NEXT: v_mov_b32_e32 v2, 0 +; SDAG-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v7, v10, v[1:2] +; SDAG-NEXT: v_add3_u32 v4, v4, v13, v12 +; SDAG-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v11, v[3:4] +; SDAG-NEXT: v_mov_b32_e32 v1, v5 +; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v11, v8, v[1:2] +; SDAG-NEXT: v_mul_lo_u32 v10, v9, v7 +; SDAG-NEXT: 
v_mul_lo_u32 v9, v9, v11 +; SDAG-NEXT: v_add_co_u32_e64 v5, s[4:5], v6, v2 ; SDAG-NEXT: v_addc_co_u32_e64 v6, s[4:5], 0, 0, s[4:5] -; SDAG-NEXT: v_mul_lo_u32 v3, v9, v11 -; SDAG-NEXT: v_mul_lo_u32 v9, v9, v12 -; SDAG-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v11, v8, v[5:6] +; SDAG-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v7, v8, v[5:6] +; SDAG-NEXT: v_add3_u32 v4, v9, v4, v10 ; SDAG-NEXT: ; implicit-def: $vgpr10 ; SDAG-NEXT: ; implicit-def: $vgpr8 -; SDAG-NEXT: v_add3_u32 v3, v9, v2, v3 -; SDAG-NEXT: v_add_co_u32_e64 v2, s[4:5], v5, v1 -; SDAG-NEXT: v_addc_co_u32_e64 v3, s[4:5], v6, v3, s[4:5] -; SDAG-NEXT: v_mov_b32_e32 v1, v4 +; SDAG-NEXT: ; implicit-def: $vgpr9 +; SDAG-NEXT: v_add_co_u32_e64 v2, s[4:5], v5, v3 +; SDAG-NEXT: v_addc_co_u32_e64 v3, s[4:5], v6, v4, s[4:5] ; SDAG-NEXT: ; implicit-def: $vgpr6_vgpr7 ; SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 -; SDAG-NEXT: ; implicit-def: $vgpr9 ; SDAG-NEXT: .LBB0_4: ; %Flow ; SDAG-NEXT: s_andn2_saveexec_b64 s[12:13], s[12:13] ; SDAG-NEXT: s_cbranch_execz .LBB0_6 @@ -119,10 +117,11 @@ define i128 @fptosi_f64_to_i128(double %x) { ; SDAG-NEXT: ; %bb.8: ; %fp-to-i-if-then5 ; SDAG-NEXT: v_bfrev_b32_e32 v0, 1 ; SDAG-NEXT: v_bfrev_b32_e32 v1, -2 -; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc -; SDAG-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc -; SDAG-NEXT: v_mov_b32_e32 v0, v2 -; SDAG-NEXT: v_mov_b32_e32 v1, v2 +; SDAG-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc +; SDAG-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc +; SDAG-NEXT: v_mov_b32_e32 v3, v2 +; SDAG-NEXT: v_mov_b32_e32 v0, v1 +; SDAG-NEXT: v_mov_b32_e32 v2, v1 ; SDAG-NEXT: ; %bb.9: ; %Flow3 ; SDAG-NEXT: s_or_b64 exec, exec, s[4:5] ; SDAG-NEXT: .LBB0_10: ; %fp-to-i-cleanup @@ -234,37 +233,36 @@ define i128 @fptosi_f64_to_i128(double %x) { ; GISEL-NEXT: s_xor_b64 s[16:17], exec, s[6:7] ; GISEL-NEXT: s_cbranch_execz .LBB0_4 ; GISEL-NEXT: ; %bb.3: ; %fp-to-i-if-else -; GISEL-NEXT: v_add_u32_e32 v7, 0xfffffbcd, v6 -; GISEL-NEXT: v_lshlrev_b64 v[0:1], v7, v[4:5] -; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v7 -; GISEL-NEXT: v_cndmask_b32_e32 v12, 0, v0, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v13, 0, v1, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v12, v9, 0 -; GISEL-NEXT: v_add_u32_e32 v6, 0xfffffb8d, v6 -; GISEL-NEXT: v_sub_u32_e32 v2, 64, v7 -; GISEL-NEXT: v_lshrrev_b64 v[2:3], v2, v[4:5] -; GISEL-NEXT: v_lshlrev_b64 v[4:5], v6, v[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v7 -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v13, v9, v[0:1] -; GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v14, v2, 0, s[6:7] -; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[8:9], v14, v8, v[6:7] -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v12, v8, 0 -; GISEL-NEXT: v_mov_b32_e32 v2, v10 -; GISEL-NEXT: v_mul_lo_u32 v10, v12, v9 -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v12, v9, v[1:2] -; GISEL-NEXT: v_mul_lo_u32 v4, v13, v9 -; GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v13, v8, v[6:7] -; GISEL-NEXT: v_addc_co_u32_e64 v6, s[10:11], v11, v10, s[10:11] -; GISEL-NEXT: v_addc_co_u32_e64 v4, s[8:9], v6, v4, s[8:9] -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v14, v9, v[4:5] -; GISEL-NEXT: v_cndmask_b32_e64 v5, v3, 0, s[6:7] -; GISEL-NEXT: ; implicit-def: $vgpr9 -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v5, v8, v[6:7] +; GISEL-NEXT: v_add_u32_e32 v2, 0xfffffbcd, v6 +; GISEL-NEXT: v_lshlrev_b64 v[0:1], v2, v[4:5] +; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v2 +; GISEL-NEXT: v_cndmask_b32_e32 v10, 0, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v11, 0, v1, vcc +; GISEL-NEXT: 
v_mad_u64_u32 v[0:1], s[6:7], v10, v9, 0 +; GISEL-NEXT: v_add_u32_e32 v3, 0xfffffb8d, v6 +; GISEL-NEXT: v_sub_u32_e32 v6, 64, v2 +; GISEL-NEXT: v_lshrrev_b64 v[6:7], v6, v[4:5] +; GISEL-NEXT: v_lshlrev_b64 v[3:4], v3, v[4:5] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v2 +; GISEL-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[8:9], v11, v9, v[0:1] +; GISEL-NEXT: v_cndmask_b32_e64 v12, v3, 0, s[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v10, v8, 0 +; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[8:9], v12, v8, v[5:6] +; GISEL-NEXT: v_mul_lo_u32 v13, v11, v9 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[8:9], v10, v9, v[1:2] +; GISEL-NEXT: v_mul_lo_u32 v10, v10, v9 +; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v11, v8, v[5:6] +; GISEL-NEXT: v_addc_co_u32_e64 v3, s[10:11], v3, v10, s[10:11] +; GISEL-NEXT: v_addc_co_u32_e64 v3, s[8:9], v3, v13, s[8:9] +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[8:9], v12, v9, v[3:4] +; GISEL-NEXT: v_cndmask_b32_e32 v3, v4, v7, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v7, v3, 0, s[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v7, v8, v[5:6] ; GISEL-NEXT: ; implicit-def: $vgpr6 ; GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GISEL-NEXT: ; implicit-def: $vgpr8 +; GISEL-NEXT: ; implicit-def: $vgpr9 ; GISEL-NEXT: .LBB0_4: ; %Flow ; GISEL-NEXT: s_andn2_saveexec_b64 s[8:9], s[16:17] ; GISEL-NEXT: s_cbranch_execz .LBB0_6 @@ -276,16 +274,15 @@ define i128 @fptosi_f64_to_i128(double %x) { ; GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc ; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GISEL-NEXT: v_cndmask_b32_e32 v6, v0, v4, vcc -; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[6:7], v6, v9, 0 ; GISEL-NEXT: v_cndmask_b32_e32 v7, v1, v5, vcc +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[6:7], v6, v9, 0 ; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v6, v8, 0 -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[6:7], v7, v9, v[2:3] +; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[6:7], v7, v9, v[4:5] ; GISEL-NEXT: v_mul_lo_u32 v10, v7, v9 -; GISEL-NEXT: v_mov_b32_e32 v2, v4 -; GISEL-NEXT: v_mad_u64_u32 v[3:4], vcc, v6, v9, v[1:2] +; GISEL-NEXT: v_mad_u64_u32 v[4:5], vcc, v6, v9, v[1:2] ; GISEL-NEXT: v_mul_lo_u32 v6, v6, v9 -; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[6:7], v7, v8, v[3:4] -; GISEL-NEXT: v_addc_co_u32_e64 v3, s[6:7], v5, v6, s[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[6:7], v7, v8, v[4:5] +; GISEL-NEXT: v_addc_co_u32_e64 v3, s[6:7], v3, v6, s[6:7] ; GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v10, vcc ; GISEL-NEXT: .LBB0_6: ; %Flow1 ; GISEL-NEXT: s_or_b64 exec, exec, s[8:9] @@ -429,34 +426,32 @@ define i128 @fptoui_f64_to_i128(double %x) { ; SDAG-NEXT: v_cndmask_b32_e64 v3, 0, v1, s[6:7] ; SDAG-NEXT: v_cndmask_b32_e64 v2, v2, v0, s[4:5] ; SDAG-NEXT: v_lshlrev_b64 v[0:1], v7, v[4:5] -; SDAG-NEXT: v_mov_b32_e32 v5, 0 -; SDAG-NEXT: v_cndmask_b32_e64 v12, 0, v0, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v11, 0, v1, s[4:5] -; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v12, v10, 0 ; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, v2, s[6:7] -; SDAG-NEXT: v_mul_lo_u32 v13, v8, v2 -; SDAG-NEXT: v_mov_b32_e32 v4, v1 -; SDAG-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v11, v10, v[4:5] -; SDAG-NEXT: v_mul_lo_u32 v14, v10, v3 -; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v10, v2, 0 -; SDAG-NEXT: v_mov_b32_e32 v4, v6 -; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v12, v8, v[4:5] -; SDAG-NEXT: v_add3_u32 v3, v3, v14, v13 -; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v12, v[2:3] -; SDAG-NEXT: v_add_co_u32_e64 v5, s[4:5], v7, v5 +; SDAG-NEXT: v_cndmask_b32_e64 v11, 0, v0, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v7, 0, v1, s[4:5] +; SDAG-NEXT: 
v_mad_u64_u32 v[0:1], s[4:5], v11, v10, 0 +; SDAG-NEXT: v_mul_lo_u32 v12, v8, v2 +; SDAG-NEXT: v_mul_lo_u32 v13, v10, v3 +; SDAG-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v10, v2, 0 +; SDAG-NEXT: v_mov_b32_e32 v2, 0 +; SDAG-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v7, v10, v[1:2] +; SDAG-NEXT: v_add3_u32 v4, v4, v13, v12 +; SDAG-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v11, v[3:4] +; SDAG-NEXT: v_mov_b32_e32 v1, v5 +; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v11, v8, v[1:2] +; SDAG-NEXT: v_mul_lo_u32 v10, v9, v7 +; SDAG-NEXT: v_mul_lo_u32 v9, v9, v11 +; SDAG-NEXT: v_add_co_u32_e64 v5, s[4:5], v6, v2 ; SDAG-NEXT: v_addc_co_u32_e64 v6, s[4:5], 0, 0, s[4:5] -; SDAG-NEXT: v_mul_lo_u32 v3, v9, v11 -; SDAG-NEXT: v_mul_lo_u32 v9, v9, v12 -; SDAG-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v11, v8, v[5:6] +; SDAG-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v7, v8, v[5:6] +; SDAG-NEXT: v_add3_u32 v4, v9, v4, v10 ; SDAG-NEXT: ; implicit-def: $vgpr10 ; SDAG-NEXT: ; implicit-def: $vgpr8 -; SDAG-NEXT: v_add3_u32 v3, v9, v2, v3 -; SDAG-NEXT: v_add_co_u32_e64 v2, s[4:5], v5, v1 -; SDAG-NEXT: v_addc_co_u32_e64 v3, s[4:5], v6, v3, s[4:5] -; SDAG-NEXT: v_mov_b32_e32 v1, v4 +; SDAG-NEXT: ; implicit-def: $vgpr9 +; SDAG-NEXT: v_add_co_u32_e64 v2, s[4:5], v5, v3 +; SDAG-NEXT: v_addc_co_u32_e64 v3, s[4:5], v6, v4, s[4:5] ; SDAG-NEXT: ; implicit-def: $vgpr6_vgpr7 ; SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 -; SDAG-NEXT: ; implicit-def: $vgpr9 ; SDAG-NEXT: .LBB1_4: ; %Flow ; SDAG-NEXT: s_andn2_saveexec_b64 s[12:13], s[12:13] ; SDAG-NEXT: s_cbranch_execz .LBB1_6 @@ -487,10 +482,11 @@ define i128 @fptoui_f64_to_i128(double %x) { ; SDAG-NEXT: ; %bb.8: ; %fp-to-i-if-then5 ; SDAG-NEXT: v_bfrev_b32_e32 v0, 1 ; SDAG-NEXT: v_bfrev_b32_e32 v1, -2 -; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc -; SDAG-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc -; SDAG-NEXT: v_mov_b32_e32 v0, v2 -; SDAG-NEXT: v_mov_b32_e32 v1, v2 +; SDAG-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc +; SDAG-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc +; SDAG-NEXT: v_mov_b32_e32 v3, v2 +; SDAG-NEXT: v_mov_b32_e32 v0, v1 +; SDAG-NEXT: v_mov_b32_e32 v2, v1 ; SDAG-NEXT: ; %bb.9: ; %Flow3 ; SDAG-NEXT: s_or_b64 exec, exec, s[4:5] ; SDAG-NEXT: .LBB1_10: ; %fp-to-i-cleanup @@ -602,37 +598,36 @@ define i128 @fptoui_f64_to_i128(double %x) { ; GISEL-NEXT: s_xor_b64 s[16:17], exec, s[6:7] ; GISEL-NEXT: s_cbranch_execz .LBB1_4 ; GISEL-NEXT: ; %bb.3: ; %fp-to-i-if-else -; GISEL-NEXT: v_add_u32_e32 v7, 0xfffffbcd, v6 -; GISEL-NEXT: v_lshlrev_b64 v[0:1], v7, v[4:5] -; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v7 -; GISEL-NEXT: v_cndmask_b32_e32 v12, 0, v0, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v13, 0, v1, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v12, v9, 0 -; GISEL-NEXT: v_add_u32_e32 v6, 0xfffffb8d, v6 -; GISEL-NEXT: v_sub_u32_e32 v2, 64, v7 -; GISEL-NEXT: v_lshrrev_b64 v[2:3], v2, v[4:5] -; GISEL-NEXT: v_lshlrev_b64 v[4:5], v6, v[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v7 -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v13, v9, v[0:1] -; GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v14, v2, 0, s[6:7] -; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[8:9], v14, v8, v[6:7] -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v12, v8, 0 -; GISEL-NEXT: v_mov_b32_e32 v2, v10 -; GISEL-NEXT: v_mul_lo_u32 v10, v12, v9 -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v12, v9, v[1:2] -; GISEL-NEXT: v_mul_lo_u32 v4, v13, v9 -; GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v13, v8, v[6:7] -; GISEL-NEXT: v_addc_co_u32_e64 v6, s[10:11], v11, v10, s[10:11] -; 
GISEL-NEXT: v_addc_co_u32_e64 v4, s[8:9], v6, v4, s[8:9] -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v14, v9, v[4:5] -; GISEL-NEXT: v_cndmask_b32_e64 v5, v3, 0, s[6:7] -; GISEL-NEXT: ; implicit-def: $vgpr9 -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v5, v8, v[6:7] +; GISEL-NEXT: v_add_u32_e32 v2, 0xfffffbcd, v6 +; GISEL-NEXT: v_lshlrev_b64 v[0:1], v2, v[4:5] +; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v2 +; GISEL-NEXT: v_cndmask_b32_e32 v10, 0, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v11, 0, v1, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v10, v9, 0 +; GISEL-NEXT: v_add_u32_e32 v3, 0xfffffb8d, v6 +; GISEL-NEXT: v_sub_u32_e32 v6, 64, v2 +; GISEL-NEXT: v_lshrrev_b64 v[6:7], v6, v[4:5] +; GISEL-NEXT: v_lshlrev_b64 v[3:4], v3, v[4:5] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v2 +; GISEL-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[8:9], v11, v9, v[0:1] +; GISEL-NEXT: v_cndmask_b32_e64 v12, v3, 0, s[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v10, v8, 0 +; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[8:9], v12, v8, v[5:6] +; GISEL-NEXT: v_mul_lo_u32 v13, v11, v9 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[8:9], v10, v9, v[1:2] +; GISEL-NEXT: v_mul_lo_u32 v10, v10, v9 +; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v11, v8, v[5:6] +; GISEL-NEXT: v_addc_co_u32_e64 v3, s[10:11], v3, v10, s[10:11] +; GISEL-NEXT: v_addc_co_u32_e64 v3, s[8:9], v3, v13, s[8:9] +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[8:9], v12, v9, v[3:4] +; GISEL-NEXT: v_cndmask_b32_e32 v3, v4, v7, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v7, v3, 0, s[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v7, v8, v[5:6] ; GISEL-NEXT: ; implicit-def: $vgpr6 ; GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GISEL-NEXT: ; implicit-def: $vgpr8 +; GISEL-NEXT: ; implicit-def: $vgpr9 ; GISEL-NEXT: .LBB1_4: ; %Flow ; GISEL-NEXT: s_andn2_saveexec_b64 s[8:9], s[16:17] ; GISEL-NEXT: s_cbranch_execz .LBB1_6 @@ -644,16 +639,15 @@ define i128 @fptoui_f64_to_i128(double %x) { ; GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc ; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GISEL-NEXT: v_cndmask_b32_e32 v6, v0, v4, vcc -; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[6:7], v6, v9, 0 ; GISEL-NEXT: v_cndmask_b32_e32 v7, v1, v5, vcc +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[6:7], v6, v9, 0 ; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v6, v8, 0 -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[6:7], v7, v9, v[2:3] +; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[6:7], v7, v9, v[4:5] ; GISEL-NEXT: v_mul_lo_u32 v10, v7, v9 -; GISEL-NEXT: v_mov_b32_e32 v2, v4 -; GISEL-NEXT: v_mad_u64_u32 v[3:4], vcc, v6, v9, v[1:2] +; GISEL-NEXT: v_mad_u64_u32 v[4:5], vcc, v6, v9, v[1:2] ; GISEL-NEXT: v_mul_lo_u32 v6, v6, v9 -; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[6:7], v7, v8, v[3:4] -; GISEL-NEXT: v_addc_co_u32_e64 v3, s[6:7], v5, v6, s[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[6:7], v7, v8, v[4:5] +; GISEL-NEXT: v_addc_co_u32_e64 v3, s[6:7], v3, v6, s[6:7] ; GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v10, vcc ; GISEL-NEXT: .LBB1_6: ; %Flow1 ; GISEL-NEXT: s_or_b64 exec, exec, s[8:9] @@ -798,30 +792,30 @@ define i128 @fptosi_f32_to_i128(float %x) { ; SDAG-NEXT: v_cndmask_b32_e64 v2, v2, v0, s[4:5] ; SDAG-NEXT: v_lshlrev_b64 v[0:1], v4, v[7:8] ; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, v2, s[6:7] -; SDAG-NEXT: v_cndmask_b32_e64 v13, 0, v0, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v12, 0, v1, s[4:5] -; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v13, v11, 0 -; SDAG-NEXT: v_mul_lo_u32 v4, v9, v2 -; SDAG-NEXT: v_mul_lo_u32 v14, v11, v3 -; SDAG-NEXT: v_mov_b32_e32 v5, v1 -; SDAG-NEXT: v_mad_u64_u32 v[7:8], 
s[4:5], v12, v11, v[5:6] -; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v11, v2, 0 -; SDAG-NEXT: v_mov_b32_e32 v5, v7 -; SDAG-NEXT: v_mul_lo_u32 v7, v10, v13 -; SDAG-NEXT: v_add3_u32 v3, v3, v14, v4 -; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v13, v9, v[5:6] -; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v10, v13, v[2:3] -; SDAG-NEXT: v_add_co_u32_e64 v5, s[4:5], v8, v5 -; SDAG-NEXT: v_addc_co_u32_e64 v6, s[4:5], 0, 0, s[4:5] -; SDAG-NEXT: v_mul_lo_u32 v3, v10, v12 -; SDAG-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v12, v9, v[5:6] +; SDAG-NEXT: v_cndmask_b32_e64 v7, 0, v0, s[4:5] +; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[6:7], v7, v11, 0 +; SDAG-NEXT: v_cndmask_b32_e64 v13, 0, v1, s[4:5] +; SDAG-NEXT: v_mul_lo_u32 v8, v9, v2 +; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v13, v11, v[5:6] +; SDAG-NEXT: v_mul_lo_u32 v12, v11, v3 +; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[6:7], v11, v2, 0 +; SDAG-NEXT: v_mov_b32_e32 v5, v0 +; SDAG-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v7, v9, v[5:6] +; SDAG-NEXT: v_add3_u32 v3, v3, v12, v8 +; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v10, v7, v[2:3] +; SDAG-NEXT: v_add_co_u32_e64 v0, s[4:5], v1, v6 +; SDAG-NEXT: v_addc_co_u32_e64 v1, s[4:5], 0, 0, s[4:5] +; SDAG-NEXT: v_mul_lo_u32 v8, v10, v13 +; SDAG-NEXT: v_mul_lo_u32 v7, v10, v7 +; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v13, v9, v[0:1] ; SDAG-NEXT: ; implicit-def: $vgpr11 ; SDAG-NEXT: ; implicit-def: $vgpr9 ; SDAG-NEXT: ; implicit-def: $vgpr10 -; SDAG-NEXT: v_add3_u32 v3, v7, v2, v3 -; SDAG-NEXT: v_add_co_u32_e64 v2, s[4:5], v5, v1 -; SDAG-NEXT: v_addc_co_u32_e64 v3, s[4:5], v6, v3, s[4:5] -; SDAG-NEXT: v_mov_b32_e32 v1, v4 +; SDAG-NEXT: v_add3_u32 v3, v7, v3, v8 +; SDAG-NEXT: v_add_co_u32_e64 v2, s[4:5], v0, v2 +; SDAG-NEXT: v_addc_co_u32_e64 v3, s[4:5], v1, v3, s[4:5] +; SDAG-NEXT: v_mov_b32_e32 v0, v4 +; SDAG-NEXT: v_mov_b32_e32 v1, v5 ; SDAG-NEXT: ; implicit-def: $vgpr5_vgpr6 ; SDAG-NEXT: ; implicit-def: $vgpr7_vgpr8 ; SDAG-NEXT: .LBB2_4: ; %Flow @@ -836,9 +830,9 @@ define i128 @fptosi_f32_to_i128(float %x) { ; SDAG-NEXT: v_cndmask_b32_e64 v3, v0, v7, s[4:5] ; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v3, v11, 0 ; SDAG-NEXT: v_mov_b32_e32 v2, 0 +; SDAG-NEXT: v_mov_b32_e32 v6, v2 ; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v3, v9, v[1:2] -; SDAG-NEXT: v_mov_b32_e32 v1, v5 -; SDAG-NEXT: v_mad_i64_i32 v[2:3], s[4:5], v10, v3, v[1:2] +; SDAG-NEXT: v_mad_i64_i32 v[2:3], s[4:5], v10, v3, v[5:6] ; SDAG-NEXT: v_mov_b32_e32 v1, v4 ; SDAG-NEXT: .LBB2_6: ; %Flow1 ; SDAG-NEXT: s_or_b64 exec, exec, s[6:7] @@ -847,10 +841,11 @@ define i128 @fptosi_f32_to_i128(float %x) { ; SDAG-NEXT: ; %bb.8: ; %fp-to-i-if-then5 ; SDAG-NEXT: v_bfrev_b32_e32 v0, 1 ; SDAG-NEXT: v_bfrev_b32_e32 v1, -2 -; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc -; SDAG-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc -; SDAG-NEXT: v_mov_b32_e32 v0, v2 -; SDAG-NEXT: v_mov_b32_e32 v1, v2 +; SDAG-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc +; SDAG-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc +; SDAG-NEXT: v_mov_b32_e32 v3, v2 +; SDAG-NEXT: v_mov_b32_e32 v0, v1 +; SDAG-NEXT: v_mov_b32_e32 v2, v1 ; SDAG-NEXT: ; %bb.9: ; %Flow3 ; SDAG-NEXT: s_or_b64 exec, exec, s[4:5] ; SDAG-NEXT: .LBB2_10: ; %fp-to-i-cleanup @@ -961,37 +956,36 @@ define i128 @fptosi_f32_to_i128(float %x) { ; GISEL-NEXT: s_xor_b64 s[16:17], exec, s[6:7] ; GISEL-NEXT: s_cbranch_execz .LBB2_4 ; GISEL-NEXT: ; %bb.3: ; %fp-to-i-if-else -; GISEL-NEXT: v_add_u32_e32 v7, 0xffffff6a, v6 -; GISEL-NEXT: v_lshlrev_b64 v[0:1], v7, v[4:5] -; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v7 -; GISEL-NEXT: v_cndmask_b32_e32 v12, 0, v0, vcc -; GISEL-NEXT: 
v_cndmask_b32_e32 v13, 0, v1, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v12, v8, 0 -; GISEL-NEXT: v_add_u32_e32 v6, 0xffffff2a, v6 -; GISEL-NEXT: v_sub_u32_e32 v2, 64, v7 -; GISEL-NEXT: v_lshrrev_b64 v[2:3], v2, v[4:5] -; GISEL-NEXT: v_lshlrev_b64 v[4:5], v6, v[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v7 -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v13, v8, v[0:1] -; GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v14, v2, 0, s[6:7] -; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[8:9], v14, v9, v[6:7] -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v12, v9, 0 -; GISEL-NEXT: v_mov_b32_e32 v2, v10 -; GISEL-NEXT: v_mul_lo_u32 v10, v12, v8 -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v12, v8, v[1:2] -; GISEL-NEXT: v_mul_lo_u32 v4, v13, v8 -; GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v13, v9, v[6:7] -; GISEL-NEXT: v_addc_co_u32_e64 v6, s[10:11], v11, v10, s[10:11] -; GISEL-NEXT: v_addc_co_u32_e64 v4, s[8:9], v6, v4, s[8:9] -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v14, v8, v[4:5] -; GISEL-NEXT: v_cndmask_b32_e64 v5, v3, 0, s[6:7] -; GISEL-NEXT: ; implicit-def: $vgpr8 -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v5, v9, v[6:7] +; GISEL-NEXT: v_add_u32_e32 v2, 0xffffff6a, v6 +; GISEL-NEXT: v_lshlrev_b64 v[0:1], v2, v[4:5] +; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v2 +; GISEL-NEXT: v_cndmask_b32_e32 v10, 0, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v11, 0, v1, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v10, v8, 0 +; GISEL-NEXT: v_add_u32_e32 v3, 0xffffff2a, v6 +; GISEL-NEXT: v_sub_u32_e32 v6, 64, v2 +; GISEL-NEXT: v_lshrrev_b64 v[6:7], v6, v[4:5] +; GISEL-NEXT: v_lshlrev_b64 v[3:4], v3, v[4:5] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v2 +; GISEL-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[8:9], v11, v8, v[0:1] +; GISEL-NEXT: v_cndmask_b32_e64 v12, v3, 0, s[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v10, v9, 0 +; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[8:9], v12, v9, v[5:6] +; GISEL-NEXT: v_mul_lo_u32 v13, v11, v8 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[8:9], v10, v8, v[1:2] +; GISEL-NEXT: v_mul_lo_u32 v10, v10, v8 +; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v11, v9, v[5:6] +; GISEL-NEXT: v_addc_co_u32_e64 v3, s[10:11], v3, v10, s[10:11] +; GISEL-NEXT: v_addc_co_u32_e64 v3, s[8:9], v3, v13, s[8:9] +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[8:9], v12, v8, v[3:4] +; GISEL-NEXT: v_cndmask_b32_e32 v3, v4, v7, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v7, v3, 0, s[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v7, v9, v[5:6] ; GISEL-NEXT: ; implicit-def: $vgpr6 ; GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GISEL-NEXT: ; implicit-def: $vgpr9 +; GISEL-NEXT: ; implicit-def: $vgpr8 ; GISEL-NEXT: .LBB2_4: ; %Flow ; GISEL-NEXT: s_andn2_saveexec_b64 s[6:7], s[16:17] ; GISEL-NEXT: s_cbranch_execz .LBB2_6 @@ -1004,11 +998,11 @@ define i128 @fptosi_f32_to_i128(float %x) { ; GISEL-NEXT: v_cndmask_b32_e32 v6, v0, v4, vcc ; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v6, v9, 0 ; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[8:9], v6, v8, 0 -; GISEL-NEXT: v_mov_b32_e32 v4, v1 ; GISEL-NEXT: v_mul_lo_u32 v7, v6, v8 -; GISEL-NEXT: v_mov_b32_e32 v5, v2 -; GISEL-NEXT: v_mad_u64_u32 v[1:2], vcc, v6, v8, v[4:5] +; GISEL-NEXT: v_mad_u64_u32 v[4:5], vcc, v6, v8, v[1:2] ; GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v7, vcc +; GISEL-NEXT: v_mov_b32_e32 v1, v4 +; GISEL-NEXT: v_mov_b32_e32 v2, v5 ; GISEL-NEXT: .LBB2_6: ; %Flow1 ; GISEL-NEXT: s_or_b64 exec, exec, s[6:7] ; GISEL-NEXT: .LBB2_7: ; %Flow2 @@ -1152,30 
+1146,30 @@ define i128 @fptoui_f32_to_i128(float %x) { ; SDAG-NEXT: v_cndmask_b32_e64 v2, v2, v0, s[4:5] ; SDAG-NEXT: v_lshlrev_b64 v[0:1], v4, v[7:8] ; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, v2, s[6:7] -; SDAG-NEXT: v_cndmask_b32_e64 v13, 0, v0, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v12, 0, v1, s[4:5] -; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v13, v11, 0 -; SDAG-NEXT: v_mul_lo_u32 v4, v9, v2 -; SDAG-NEXT: v_mul_lo_u32 v14, v11, v3 -; SDAG-NEXT: v_mov_b32_e32 v5, v1 -; SDAG-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v12, v11, v[5:6] -; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v11, v2, 0 -; SDAG-NEXT: v_mov_b32_e32 v5, v7 -; SDAG-NEXT: v_mul_lo_u32 v7, v10, v13 -; SDAG-NEXT: v_add3_u32 v3, v3, v14, v4 -; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v13, v9, v[5:6] -; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v10, v13, v[2:3] -; SDAG-NEXT: v_add_co_u32_e64 v5, s[4:5], v8, v5 -; SDAG-NEXT: v_addc_co_u32_e64 v6, s[4:5], 0, 0, s[4:5] -; SDAG-NEXT: v_mul_lo_u32 v3, v10, v12 -; SDAG-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v12, v9, v[5:6] +; SDAG-NEXT: v_cndmask_b32_e64 v7, 0, v0, s[4:5] +; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[6:7], v7, v11, 0 +; SDAG-NEXT: v_cndmask_b32_e64 v13, 0, v1, s[4:5] +; SDAG-NEXT: v_mul_lo_u32 v8, v9, v2 +; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v13, v11, v[5:6] +; SDAG-NEXT: v_mul_lo_u32 v12, v11, v3 +; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[6:7], v11, v2, 0 +; SDAG-NEXT: v_mov_b32_e32 v5, v0 +; SDAG-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v7, v9, v[5:6] +; SDAG-NEXT: v_add3_u32 v3, v3, v12, v8 +; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v10, v7, v[2:3] +; SDAG-NEXT: v_add_co_u32_e64 v0, s[4:5], v1, v6 +; SDAG-NEXT: v_addc_co_u32_e64 v1, s[4:5], 0, 0, s[4:5] +; SDAG-NEXT: v_mul_lo_u32 v8, v10, v13 +; SDAG-NEXT: v_mul_lo_u32 v7, v10, v7 +; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v13, v9, v[0:1] ; SDAG-NEXT: ; implicit-def: $vgpr11 ; SDAG-NEXT: ; implicit-def: $vgpr9 ; SDAG-NEXT: ; implicit-def: $vgpr10 -; SDAG-NEXT: v_add3_u32 v3, v7, v2, v3 -; SDAG-NEXT: v_add_co_u32_e64 v2, s[4:5], v5, v1 -; SDAG-NEXT: v_addc_co_u32_e64 v3, s[4:5], v6, v3, s[4:5] -; SDAG-NEXT: v_mov_b32_e32 v1, v4 +; SDAG-NEXT: v_add3_u32 v3, v7, v3, v8 +; SDAG-NEXT: v_add_co_u32_e64 v2, s[4:5], v0, v2 +; SDAG-NEXT: v_addc_co_u32_e64 v3, s[4:5], v1, v3, s[4:5] +; SDAG-NEXT: v_mov_b32_e32 v0, v4 +; SDAG-NEXT: v_mov_b32_e32 v1, v5 ; SDAG-NEXT: ; implicit-def: $vgpr5_vgpr6 ; SDAG-NEXT: ; implicit-def: $vgpr7_vgpr8 ; SDAG-NEXT: .LBB3_4: ; %Flow @@ -1190,9 +1184,9 @@ define i128 @fptoui_f32_to_i128(float %x) { ; SDAG-NEXT: v_cndmask_b32_e64 v3, v0, v7, s[4:5] ; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v3, v11, 0 ; SDAG-NEXT: v_mov_b32_e32 v2, 0 +; SDAG-NEXT: v_mov_b32_e32 v6, v2 ; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v3, v9, v[1:2] -; SDAG-NEXT: v_mov_b32_e32 v1, v5 -; SDAG-NEXT: v_mad_i64_i32 v[2:3], s[4:5], v10, v3, v[1:2] +; SDAG-NEXT: v_mad_i64_i32 v[2:3], s[4:5], v10, v3, v[5:6] ; SDAG-NEXT: v_mov_b32_e32 v1, v4 ; SDAG-NEXT: .LBB3_6: ; %Flow1 ; SDAG-NEXT: s_or_b64 exec, exec, s[6:7] @@ -1201,10 +1195,11 @@ define i128 @fptoui_f32_to_i128(float %x) { ; SDAG-NEXT: ; %bb.8: ; %fp-to-i-if-then5 ; SDAG-NEXT: v_bfrev_b32_e32 v0, 1 ; SDAG-NEXT: v_bfrev_b32_e32 v1, -2 -; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc -; SDAG-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc -; SDAG-NEXT: v_mov_b32_e32 v0, v2 -; SDAG-NEXT: v_mov_b32_e32 v1, v2 +; SDAG-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc +; SDAG-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc +; SDAG-NEXT: v_mov_b32_e32 v3, v2 +; SDAG-NEXT: v_mov_b32_e32 v0, v1 +; SDAG-NEXT: v_mov_b32_e32 v2, v1 ; 
SDAG-NEXT: ; %bb.9: ; %Flow3 ; SDAG-NEXT: s_or_b64 exec, exec, s[4:5] ; SDAG-NEXT: .LBB3_10: ; %fp-to-i-cleanup @@ -1315,37 +1310,36 @@ define i128 @fptoui_f32_to_i128(float %x) { ; GISEL-NEXT: s_xor_b64 s[16:17], exec, s[6:7] ; GISEL-NEXT: s_cbranch_execz .LBB3_4 ; GISEL-NEXT: ; %bb.3: ; %fp-to-i-if-else -; GISEL-NEXT: v_add_u32_e32 v7, 0xffffff6a, v6 -; GISEL-NEXT: v_lshlrev_b64 v[0:1], v7, v[4:5] -; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v7 -; GISEL-NEXT: v_cndmask_b32_e32 v12, 0, v0, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v13, 0, v1, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v12, v8, 0 -; GISEL-NEXT: v_add_u32_e32 v6, 0xffffff2a, v6 -; GISEL-NEXT: v_sub_u32_e32 v2, 64, v7 -; GISEL-NEXT: v_lshrrev_b64 v[2:3], v2, v[4:5] -; GISEL-NEXT: v_lshlrev_b64 v[4:5], v6, v[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v7 -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v13, v8, v[0:1] -; GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v14, v2, 0, s[6:7] -; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[8:9], v14, v9, v[6:7] -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v12, v9, 0 -; GISEL-NEXT: v_mov_b32_e32 v2, v10 -; GISEL-NEXT: v_mul_lo_u32 v10, v12, v8 -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v12, v8, v[1:2] -; GISEL-NEXT: v_mul_lo_u32 v4, v13, v8 -; GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v13, v9, v[6:7] -; GISEL-NEXT: v_addc_co_u32_e64 v6, s[10:11], v11, v10, s[10:11] -; GISEL-NEXT: v_addc_co_u32_e64 v4, s[8:9], v6, v4, s[8:9] -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v14, v8, v[4:5] -; GISEL-NEXT: v_cndmask_b32_e64 v5, v3, 0, s[6:7] -; GISEL-NEXT: ; implicit-def: $vgpr8 -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v5, v9, v[6:7] +; GISEL-NEXT: v_add_u32_e32 v2, 0xffffff6a, v6 +; GISEL-NEXT: v_lshlrev_b64 v[0:1], v2, v[4:5] +; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v2 +; GISEL-NEXT: v_cndmask_b32_e32 v10, 0, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v11, 0, v1, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v10, v8, 0 +; GISEL-NEXT: v_add_u32_e32 v3, 0xffffff2a, v6 +; GISEL-NEXT: v_sub_u32_e32 v6, 64, v2 +; GISEL-NEXT: v_lshrrev_b64 v[6:7], v6, v[4:5] +; GISEL-NEXT: v_lshlrev_b64 v[3:4], v3, v[4:5] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v2 +; GISEL-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[8:9], v11, v8, v[0:1] +; GISEL-NEXT: v_cndmask_b32_e64 v12, v3, 0, s[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v10, v9, 0 +; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[8:9], v12, v9, v[5:6] +; GISEL-NEXT: v_mul_lo_u32 v13, v11, v8 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[8:9], v10, v8, v[1:2] +; GISEL-NEXT: v_mul_lo_u32 v10, v10, v8 +; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v11, v9, v[5:6] +; GISEL-NEXT: v_addc_co_u32_e64 v3, s[10:11], v3, v10, s[10:11] +; GISEL-NEXT: v_addc_co_u32_e64 v3, s[8:9], v3, v13, s[8:9] +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[8:9], v12, v8, v[3:4] +; GISEL-NEXT: v_cndmask_b32_e32 v3, v4, v7, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v7, v3, 0, s[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v7, v9, v[5:6] ; GISEL-NEXT: ; implicit-def: $vgpr6 ; GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GISEL-NEXT: ; implicit-def: $vgpr9 +; GISEL-NEXT: ; implicit-def: $vgpr8 ; GISEL-NEXT: .LBB3_4: ; %Flow ; GISEL-NEXT: s_andn2_saveexec_b64 s[6:7], s[16:17] ; GISEL-NEXT: s_cbranch_execz .LBB3_6 @@ -1358,11 +1352,11 @@ define i128 @fptoui_f32_to_i128(float %x) { ; GISEL-NEXT: v_cndmask_b32_e32 v6, v0, v4, vcc ; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v6, v9, 0 ; GISEL-NEXT: 
v_mad_u64_u32 v[2:3], s[8:9], v6, v8, 0 -; GISEL-NEXT: v_mov_b32_e32 v4, v1 ; GISEL-NEXT: v_mul_lo_u32 v7, v6, v8 -; GISEL-NEXT: v_mov_b32_e32 v5, v2 -; GISEL-NEXT: v_mad_u64_u32 v[1:2], vcc, v6, v8, v[4:5] +; GISEL-NEXT: v_mad_u64_u32 v[4:5], vcc, v6, v8, v[1:2] ; GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v7, vcc +; GISEL-NEXT: v_mov_b32_e32 v1, v4 +; GISEL-NEXT: v_mov_b32_e32 v2, v5 ; GISEL-NEXT: .LBB3_6: ; %Flow1 ; GISEL-NEXT: s_or_b64 exec, exec, s[6:7] ; GISEL-NEXT: .LBB3_7: ; %Flow2 @@ -1545,28 +1539,28 @@ define i128 @fptosi_bf16_to_i128(bfloat %x) { ; SDAG-NEXT: v_cndmask_b32_e64 v2, v2, v0, s[4:5] ; SDAG-NEXT: v_lshlrev_b64 v[0:1], v4, v[7:8] ; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, v2, s[6:7] -; SDAG-NEXT: v_cndmask_b32_e64 v13, 0, v0, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v12, 0, v1, s[4:5] -; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v13, v9, 0 -; SDAG-NEXT: v_mul_lo_u32 v4, v10, v2 -; SDAG-NEXT: v_mul_lo_u32 v14, v9, v3 -; SDAG-NEXT: v_mov_b32_e32 v5, v1 -; SDAG-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v12, v9, v[5:6] -; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v2, 0 -; SDAG-NEXT: v_mov_b32_e32 v5, v7 -; SDAG-NEXT: v_mul_lo_u32 v7, v11, v13 -; SDAG-NEXT: v_add3_u32 v3, v3, v14, v4 -; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v13, v10, v[5:6] -; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v11, v13, v[2:3] -; SDAG-NEXT: v_add_co_u32_e64 v5, s[4:5], v8, v5 -; SDAG-NEXT: v_addc_co_u32_e64 v6, s[4:5], 0, 0, s[4:5] -; SDAG-NEXT: v_mul_lo_u32 v3, v11, v12 -; SDAG-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v12, v10, v[5:6] +; SDAG-NEXT: v_cndmask_b32_e64 v7, 0, v0, s[4:5] +; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[6:7], v7, v9, 0 +; SDAG-NEXT: v_cndmask_b32_e64 v13, 0, v1, s[4:5] +; SDAG-NEXT: v_mul_lo_u32 v8, v10, v2 +; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v13, v9, v[5:6] +; SDAG-NEXT: v_mul_lo_u32 v12, v9, v3 +; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[6:7], v9, v2, 0 +; SDAG-NEXT: v_mov_b32_e32 v5, v0 +; SDAG-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v7, v10, v[5:6] +; SDAG-NEXT: v_add3_u32 v3, v3, v12, v8 +; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v11, v7, v[2:3] +; SDAG-NEXT: v_add_co_u32_e64 v0, s[4:5], v1, v6 +; SDAG-NEXT: v_addc_co_u32_e64 v1, s[4:5], 0, 0, s[4:5] +; SDAG-NEXT: v_mul_lo_u32 v8, v11, v13 +; SDAG-NEXT: v_mul_lo_u32 v7, v11, v7 +; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v13, v10, v[0:1] ; SDAG-NEXT: ; implicit-def: $vgpr9 -; SDAG-NEXT: v_add3_u32 v3, v7, v2, v3 -; SDAG-NEXT: v_add_co_u32_e64 v2, s[4:5], v5, v1 -; SDAG-NEXT: v_addc_co_u32_e64 v3, s[4:5], v6, v3, s[4:5] -; SDAG-NEXT: v_mov_b32_e32 v1, v4 +; SDAG-NEXT: v_add3_u32 v3, v7, v3, v8 +; SDAG-NEXT: v_add_co_u32_e64 v2, s[4:5], v0, v2 +; SDAG-NEXT: v_addc_co_u32_e64 v3, s[4:5], v1, v3, s[4:5] +; SDAG-NEXT: v_mov_b32_e32 v0, v4 +; SDAG-NEXT: v_mov_b32_e32 v1, v5 ; SDAG-NEXT: ; implicit-def: $vgpr5_vgpr6 ; SDAG-NEXT: ; implicit-def: $vgpr7_vgpr8 ; SDAG-NEXT: .LBB6_4: ; %Flow @@ -1590,10 +1584,11 @@ define i128 @fptosi_bf16_to_i128(bfloat %x) { ; SDAG-NEXT: ; %bb.8: ; %fp-to-i-if-then5 ; SDAG-NEXT: v_bfrev_b32_e32 v0, 1 ; SDAG-NEXT: v_bfrev_b32_e32 v1, -2 -; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc -; SDAG-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc -; SDAG-NEXT: v_mov_b32_e32 v0, v2 -; SDAG-NEXT: v_mov_b32_e32 v1, v2 +; SDAG-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc +; SDAG-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc +; SDAG-NEXT: v_mov_b32_e32 v3, v2 +; SDAG-NEXT: v_mov_b32_e32 v0, v1 +; SDAG-NEXT: v_mov_b32_e32 v2, v1 ; SDAG-NEXT: ; %bb.9: ; %Flow3 ; SDAG-NEXT: s_or_b64 exec, exec, s[4:5] ; SDAG-NEXT: .LBB6_10: ; %fp-to-i-cleanup @@ 
-1705,33 +1700,32 @@ define i128 @fptosi_bf16_to_i128(bfloat %x) { ; GISEL-NEXT: s_xor_b64 s[16:17], exec, s[6:7] ; GISEL-NEXT: s_cbranch_execz .LBB6_4 ; GISEL-NEXT: ; %bb.3: ; %fp-to-i-if-else -; GISEL-NEXT: v_add_u32_e32 v10, 0xffffff7a, v5 -; GISEL-NEXT: v_lshlrev_b64 v[0:1], v10, v[6:7] -; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v10 -; GISEL-NEXT: v_cndmask_b32_e32 v12, 0, v0, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v13, 0, v1, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v12, v9, 0 -; GISEL-NEXT: v_add_u32_e32 v4, 0xffffff3a, v5 -; GISEL-NEXT: v_sub_u32_e32 v2, 64, v10 -; GISEL-NEXT: v_lshrrev_b64 v[2:3], v2, v[6:7] -; GISEL-NEXT: v_lshlrev_b64 v[4:5], v4, v[6:7] -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v13, v9, v[0:1] -; GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v14, v2, 0, s[6:7] -; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[8:9], v14, v8, v[6:7] -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v12, v8, 0 -; GISEL-NEXT: v_mov_b32_e32 v2, v10 -; GISEL-NEXT: v_mul_lo_u32 v10, v12, v9 -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v12, v9, v[1:2] -; GISEL-NEXT: v_mul_lo_u32 v4, v13, v9 +; GISEL-NEXT: v_add_u32_e32 v2, 0xffffff7a, v5 +; GISEL-NEXT: v_lshlrev_b64 v[0:1], v2, v[6:7] +; GISEL-NEXT: v_add_u32_e32 v5, 0xffffff3a, v5 +; GISEL-NEXT: v_sub_u32_e32 v3, 64, v2 +; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v2 +; GISEL-NEXT: v_lshrrev_b64 v[3:4], v3, v[6:7] +; GISEL-NEXT: v_lshlrev_b64 v[5:6], v5, v[6:7] +; GISEL-NEXT: v_cndmask_b32_e32 v7, 0, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v12, 0, v1, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v7, v9, 0 ; GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v13, v8, v[6:7] -; GISEL-NEXT: v_addc_co_u32_e64 v6, s[10:11], v11, v10, s[10:11] -; GISEL-NEXT: v_addc_co_u32_e64 v4, s[8:9], v6, v4, s[8:9] -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v14, v9, v[4:5] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v2 +; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[8:9], v12, v9, v[0:1] +; GISEL-NEXT: v_cndmask_b32_e64 v5, v3, 0, s[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v7, v8, 0 +; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[8:9], v5, v8, v[10:11] +; GISEL-NEXT: v_mul_lo_u32 v13, v12, v9 +; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[8:9], v7, v9, v[1:2] +; GISEL-NEXT: v_mul_lo_u32 v7, v7, v9 +; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v12, v8, v[10:11] +; GISEL-NEXT: v_addc_co_u32_e64 v3, s[10:11], v3, v7, s[10:11] +; GISEL-NEXT: v_addc_co_u32_e64 v3, s[8:9], v3, v13, s[8:9] +; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[8:9], v5, v9, v[3:4] +; GISEL-NEXT: v_cndmask_b32_e32 v3, v6, v4, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v5, v3, 0, s[6:7] -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v5, v8, v[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v5, v8, v[10:11] ; GISEL-NEXT: ; implicit-def: $vgpr5 ; GISEL-NEXT: ; implicit-def: $vgpr6_vgpr7 ; GISEL-NEXT: ; implicit-def: $vgpr8 @@ -1893,28 +1887,28 @@ define i128 @fptoui_bf16_to_i128(bfloat %x) { ; SDAG-NEXT: v_cndmask_b32_e64 v2, v2, v0, s[4:5] ; SDAG-NEXT: v_lshlrev_b64 v[0:1], v4, v[7:8] ; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, v2, s[6:7] -; SDAG-NEXT: v_cndmask_b32_e64 v13, 0, v0, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v12, 0, v1, s[4:5] -; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v13, v9, 0 -; SDAG-NEXT: v_mul_lo_u32 v4, v10, v2 -; SDAG-NEXT: v_mul_lo_u32 v14, v9, v3 -; SDAG-NEXT: v_mov_b32_e32 v5, v1 -; SDAG-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v12, v9, v[5:6] -; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], 
v9, v2, 0 -; SDAG-NEXT: v_mov_b32_e32 v5, v7 -; SDAG-NEXT: v_mul_lo_u32 v7, v11, v13 -; SDAG-NEXT: v_add3_u32 v3, v3, v14, v4 -; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v13, v10, v[5:6] -; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v11, v13, v[2:3] -; SDAG-NEXT: v_add_co_u32_e64 v5, s[4:5], v8, v5 -; SDAG-NEXT: v_addc_co_u32_e64 v6, s[4:5], 0, 0, s[4:5] -; SDAG-NEXT: v_mul_lo_u32 v3, v11, v12 -; SDAG-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v12, v10, v[5:6] +; SDAG-NEXT: v_cndmask_b32_e64 v7, 0, v0, s[4:5] +; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[6:7], v7, v9, 0 +; SDAG-NEXT: v_cndmask_b32_e64 v13, 0, v1, s[4:5] +; SDAG-NEXT: v_mul_lo_u32 v8, v10, v2 +; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v13, v9, v[5:6] +; SDAG-NEXT: v_mul_lo_u32 v12, v9, v3 +; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[6:7], v9, v2, 0 +; SDAG-NEXT: v_mov_b32_e32 v5, v0 +; SDAG-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v7, v10, v[5:6] +; SDAG-NEXT: v_add3_u32 v3, v3, v12, v8 +; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v11, v7, v[2:3] +; SDAG-NEXT: v_add_co_u32_e64 v0, s[4:5], v1, v6 +; SDAG-NEXT: v_addc_co_u32_e64 v1, s[4:5], 0, 0, s[4:5] +; SDAG-NEXT: v_mul_lo_u32 v8, v11, v13 +; SDAG-NEXT: v_mul_lo_u32 v7, v11, v7 +; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v13, v10, v[0:1] ; SDAG-NEXT: ; implicit-def: $vgpr9 -; SDAG-NEXT: v_add3_u32 v3, v7, v2, v3 -; SDAG-NEXT: v_add_co_u32_e64 v2, s[4:5], v5, v1 -; SDAG-NEXT: v_addc_co_u32_e64 v3, s[4:5], v6, v3, s[4:5] -; SDAG-NEXT: v_mov_b32_e32 v1, v4 +; SDAG-NEXT: v_add3_u32 v3, v7, v3, v8 +; SDAG-NEXT: v_add_co_u32_e64 v2, s[4:5], v0, v2 +; SDAG-NEXT: v_addc_co_u32_e64 v3, s[4:5], v1, v3, s[4:5] +; SDAG-NEXT: v_mov_b32_e32 v0, v4 +; SDAG-NEXT: v_mov_b32_e32 v1, v5 ; SDAG-NEXT: ; implicit-def: $vgpr5_vgpr6 ; SDAG-NEXT: ; implicit-def: $vgpr7_vgpr8 ; SDAG-NEXT: .LBB7_4: ; %Flow @@ -1938,10 +1932,11 @@ define i128 @fptoui_bf16_to_i128(bfloat %x) { ; SDAG-NEXT: ; %bb.8: ; %fp-to-i-if-then5 ; SDAG-NEXT: v_bfrev_b32_e32 v0, 1 ; SDAG-NEXT: v_bfrev_b32_e32 v1, -2 -; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc -; SDAG-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc -; SDAG-NEXT: v_mov_b32_e32 v0, v2 -; SDAG-NEXT: v_mov_b32_e32 v1, v2 +; SDAG-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc +; SDAG-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc +; SDAG-NEXT: v_mov_b32_e32 v3, v2 +; SDAG-NEXT: v_mov_b32_e32 v0, v1 +; SDAG-NEXT: v_mov_b32_e32 v2, v1 ; SDAG-NEXT: ; %bb.9: ; %Flow3 ; SDAG-NEXT: s_or_b64 exec, exec, s[4:5] ; SDAG-NEXT: .LBB7_10: ; %fp-to-i-cleanup @@ -2053,33 +2048,32 @@ define i128 @fptoui_bf16_to_i128(bfloat %x) { ; GISEL-NEXT: s_xor_b64 s[16:17], exec, s[6:7] ; GISEL-NEXT: s_cbranch_execz .LBB7_4 ; GISEL-NEXT: ; %bb.3: ; %fp-to-i-if-else -; GISEL-NEXT: v_add_u32_e32 v10, 0xffffff7a, v5 -; GISEL-NEXT: v_lshlrev_b64 v[0:1], v10, v[6:7] -; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v10 -; GISEL-NEXT: v_cndmask_b32_e32 v12, 0, v0, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v13, 0, v1, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v12, v9, 0 -; GISEL-NEXT: v_add_u32_e32 v4, 0xffffff3a, v5 -; GISEL-NEXT: v_sub_u32_e32 v2, 64, v10 -; GISEL-NEXT: v_lshrrev_b64 v[2:3], v2, v[6:7] -; GISEL-NEXT: v_lshlrev_b64 v[4:5], v4, v[6:7] -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v13, v9, v[0:1] -; GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v14, v2, 0, s[6:7] -; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[8:9], v14, v8, v[6:7] -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v12, v8, 0 -; GISEL-NEXT: v_mov_b32_e32 v2, v10 -; GISEL-NEXT: v_mul_lo_u32 v10, v12, v9 -; 
GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v12, v9, v[1:2] -; GISEL-NEXT: v_mul_lo_u32 v4, v13, v9 +; GISEL-NEXT: v_add_u32_e32 v2, 0xffffff7a, v5 +; GISEL-NEXT: v_lshlrev_b64 v[0:1], v2, v[6:7] +; GISEL-NEXT: v_add_u32_e32 v5, 0xffffff3a, v5 +; GISEL-NEXT: v_sub_u32_e32 v3, 64, v2 +; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v2 +; GISEL-NEXT: v_lshrrev_b64 v[3:4], v3, v[6:7] +; GISEL-NEXT: v_lshlrev_b64 v[5:6], v5, v[6:7] +; GISEL-NEXT: v_cndmask_b32_e32 v7, 0, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v12, 0, v1, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v7, v9, 0 ; GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v13, v8, v[6:7] -; GISEL-NEXT: v_addc_co_u32_e64 v6, s[10:11], v11, v10, s[10:11] -; GISEL-NEXT: v_addc_co_u32_e64 v4, s[8:9], v6, v4, s[8:9] -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v14, v9, v[4:5] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v2 +; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[8:9], v12, v9, v[0:1] +; GISEL-NEXT: v_cndmask_b32_e64 v5, v3, 0, s[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v7, v8, 0 +; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[8:9], v5, v8, v[10:11] +; GISEL-NEXT: v_mul_lo_u32 v13, v12, v9 +; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[8:9], v7, v9, v[1:2] +; GISEL-NEXT: v_mul_lo_u32 v7, v7, v9 +; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v12, v8, v[10:11] +; GISEL-NEXT: v_addc_co_u32_e64 v3, s[10:11], v3, v7, s[10:11] +; GISEL-NEXT: v_addc_co_u32_e64 v3, s[8:9], v3, v13, s[8:9] +; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[8:9], v5, v9, v[3:4] +; GISEL-NEXT: v_cndmask_b32_e32 v3, v6, v4, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v5, v3, 0, s[6:7] -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v5, v8, v[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v5, v8, v[10:11] ; GISEL-NEXT: ; implicit-def: $vgpr5 ; GISEL-NEXT: ; implicit-def: $vgpr6_vgpr7 ; GISEL-NEXT: ; implicit-def: $vgpr8 diff --git a/llvm/test/CodeGen/AMDGPU/hazard-recognizer-src-shared-base.ll b/llvm/test/CodeGen/AMDGPU/hazard-recognizer-src-shared-base.ll index 4aa49f2c9296d..1db476300c261 100644 --- a/llvm/test/CodeGen/AMDGPU/hazard-recognizer-src-shared-base.ll +++ b/llvm/test/CodeGen/AMDGPU/hazard-recognizer-src-shared-base.ll @@ -6,9 +6,9 @@ define amdgpu_kernel void @foo() { ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_mov_b64 s[0:1], src_shared_base ; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; CHECK-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s1 -; CHECK-NEXT: v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v3, v0 -; CHECK-NEXT: flat_store_b64 v[0:1], v[2:3] +; CHECK-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s1 +; CHECK-NEXT: v_mov_b32_e32 v0, v1 +; CHECK-NEXT: flat_store_b64 v[1:2], v[0:1] ; CHECK-NEXT: s_endpgm entry: br label %bb1 diff --git a/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll b/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll index 10d61deed71cc..76f204dd0c16a 100644 --- a/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll +++ b/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll @@ -82,9 +82,9 @@ define void @main(i1 %arg) #0 { ; CHECK-NEXT: s_mov_b32 s70, s68 ; CHECK-NEXT: s_mov_b32 s71, s68 ; CHECK-NEXT: v_writelane_b32 v7, s67, 31 -; CHECK-NEXT: image_sample_lz v3, v[1:2], s[60:67], s[68:71] dmask:0x1 +; CHECK-NEXT: image_sample_lz v1, v[1:2], s[60:67], s[68:71] dmask:0x1 ; CHECK-NEXT: v_readlane_b32 s52, v7, 0 -; CHECK-NEXT: v_mov_b32_e32 v1, v2 +; CHECK-NEXT: v_mov_b32_e32 v3, v2 ; CHECK-NEXT: v_readlane_b32 s53, v7, 1 ; CHECK-NEXT: 
v_readlane_b32 s54, v7, 2 ; CHECK-NEXT: v_readlane_b32 s55, v7, 3 @@ -97,14 +97,14 @@ define void @main(i1 %arg) #0 { ; CHECK-NEXT: v_readlane_b32 s60, v7, 8 ; CHECK-NEXT: v_readlane_b32 s61, v7, 9 ; CHECK-NEXT: v_readlane_b32 s62, v7, 10 -; CHECK-NEXT: image_sample_lz v4, v[1:2], s[52:59], s[68:71] dmask:0x1 +; CHECK-NEXT: image_sample_lz v4, v[2:3], s[52:59], s[68:71] dmask:0x1 ; CHECK-NEXT: v_readlane_b32 s63, v7, 11 ; CHECK-NEXT: v_readlane_b32 s64, v7, 12 ; CHECK-NEXT: v_readlane_b32 s65, v7, 13 ; CHECK-NEXT: v_readlane_b32 s66, v7, 14 ; CHECK-NEXT: v_readlane_b32 s67, v7, 15 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_mul_f32_e32 v0, v4, v3 +; CHECK-NEXT: v_mul_f32_e32 v0, v4, v1 ; CHECK-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] ; CHECK-NEXT: s_xor_b64 s[6:7], exec, s[6:7] ; CHECK-NEXT: s_cbranch_execz .LBB0_3 @@ -118,13 +118,13 @@ define void @main(i1 %arg) #0 { ; CHECK-NEXT: v_readlane_b32 s65, v7, 29 ; CHECK-NEXT: v_readlane_b32 s66, v7, 30 ; CHECK-NEXT: v_readlane_b32 s67, v7, 31 +; CHECK-NEXT: v_mov_b32_e32 v1, v2 ; CHECK-NEXT: s_and_b64 vcc, exec, -1 ; CHECK-NEXT: v_readlane_b32 s53, v7, 17 ; CHECK-NEXT: v_readlane_b32 s54, v7, 18 ; CHECK-NEXT: v_readlane_b32 s55, v7, 19 +; CHECK-NEXT: image_sample_lz v3, v[2:3], s[60:67], s[68:71] dmask:0x1 ; CHECK-NEXT: v_readlane_b32 s56, v7, 20 -; CHECK-NEXT: image_sample_lz v3, v[1:2], s[60:67], s[68:71] dmask:0x1 -; CHECK-NEXT: v_mov_b32_e32 v1, v2 ; CHECK-NEXT: v_readlane_b32 s57, v7, 21 ; CHECK-NEXT: v_readlane_b32 s58, v7, 22 ; CHECK-NEXT: v_readlane_b32 s59, v7, 23 diff --git a/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll b/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll index f705a2ffc4f1d..5e2cec504c6a9 100644 --- a/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll +++ b/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll @@ -5779,19 +5779,17 @@ define i64 @clpeak_imad_pat_i64(i64 %x, i64 %y) { ; GFX7-GISEL-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v2, 0 ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v8, v3, v[1:2] -; GFX7-GISEL-NEXT: v_add_i32_e32 v10, vcc, v0, v8 +; GFX7-GISEL-NEXT: v_add_i32_e32 v1, vcc, v0, v8 ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v9, v2, v[4:5] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v2, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, v2, 0 ; GFX7-GISEL-NEXT: v_addc_u32_e32 v11, vcc, v6, v9, vcc -; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v5 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v10, v3, v[1:2] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v1, v3, v[5:6] ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v11, v2, v[7:8] ; GFX7-GISEL-NEXT: v_add_i32_e32 v7, vcc, 1, v0 ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v7, 0 -; GFX7-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v6, vcc -; GFX7-GISEL-NEXT: v_mov_b32_e32 v0, v3 +; GFX7-GISEL-NEXT: v_addc_u32_e32 v0, vcc, 0, v6, vcc ; GFX7-GISEL-NEXT: v_add_i32_e32 v8, vcc, 1, v4 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v1, v[0:1] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v0, v[3:4] ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, v8, 0 ; GFX7-GISEL-NEXT: v_addc_u32_e32 v10, vcc, 0, v9, vcc ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v7, v[5:6] @@ -5835,19 +5833,17 @@ define i64 @clpeak_imad_pat_i64(i64 %x, i64 %y) { ; GFX8-GISEL-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v2, 0 ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v8, v3, v[1:2] -; GFX8-GISEL-NEXT: v_add_u32_e32 v10, vcc, 
v0, v8 +; GFX8-GISEL-NEXT: v_add_u32_e32 v1, vcc, v0, v8 ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v9, v2, v[4:5] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v2, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, v2, 0 ; GFX8-GISEL-NEXT: v_addc_u32_e32 v11, vcc, v6, v9, vcc -; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v5 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v10, v3, v[1:2] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v1, v3, v[5:6] ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v11, v2, v[7:8] ; GFX8-GISEL-NEXT: v_add_u32_e32 v7, vcc, 1, v0 ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v7, 0 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v6, vcc -; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, v3 +; GFX8-GISEL-NEXT: v_addc_u32_e32 v0, vcc, 0, v6, vcc ; GFX8-GISEL-NEXT: v_add_u32_e32 v8, vcc, 1, v4 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v1, v[0:1] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v0, v[3:4] ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, v8, 0 ; GFX8-GISEL-NEXT: v_addc_u32_e32 v10, vcc, 0, v9, vcc ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v7, v[5:6] @@ -5887,19 +5883,17 @@ define i64 @clpeak_imad_pat_i64(i64 %x, i64 %y) { ; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v1, vcc ; GFX900-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v2, 0 ; GFX900-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v8, v3, v[1:2] -; GFX900-GISEL-NEXT: v_add_co_u32_e32 v10, vcc, v0, v8 +; GFX900-GISEL-NEXT: v_add_co_u32_e32 v1, vcc, v0, v8 ; GFX900-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v9, v2, v[4:5] -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v2, 0 +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, v2, 0 ; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v11, vcc, v6, v9, vcc -; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, v5 -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v10, v3, v[1:2] +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v1, v3, v[5:6] ; GFX900-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v11, v2, v[7:8] ; GFX900-GISEL-NEXT: v_add_co_u32_e32 v7, vcc, 1, v0 ; GFX900-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v7, 0 -; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v6, vcc -; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v6, vcc ; GFX900-GISEL-NEXT: v_add_co_u32_e32 v8, vcc, 1, v4 -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v1, v[0:1] +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v0, v[3:4] ; GFX900-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, v8, 0 ; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v10, vcc, 0, v9, vcc ; GFX900-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v7, v[5:6] @@ -5989,28 +5983,26 @@ define i64 @clpeak_imad_pat_i64(i64 %x, i64 %y) { ; GFX10-GISEL-LABEL: clpeak_imad_pat_i64: ; GFX10-GISEL: ; %bb.0: ; %entry ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, v0, 1 -; GFX10-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v1, vcc_lo -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[0:1], null, v6, v2, 0 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[4:5], null, v6, v3, v[1:2] -; GFX10-GISEL-NEXT: v_add_co_u32 v8, vcc_lo, v0, v6 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[4:5], null, v7, v2, v[4:5] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[5:6], null, v8, v2, 0 -; GFX10-GISEL-NEXT: v_add_co_ci_u32_e64 v9, null, v4, v7, vcc_lo -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, v6 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[6:7], null, v8, v3, v[1:2] ; GFX10-GISEL-NEXT: v_add_co_u32 v8, vcc_lo, v0, 1 -; GFX10-GISEL-NEXT: 
v_add_co_ci_u32_e64 v10, null, 0, v4, vcc_lo -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[3:4], null, v5, v8, 0 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[6:7], null, v9, v2, v[6:7] -; GFX10-GISEL-NEXT: v_add_co_u32 v7, vcc_lo, v5, 1 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, v4 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[0:1], null, v3, v7, 0 -; GFX10-GISEL-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v6, vcc_lo -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[4:5], null, v5, v10, v[2:3] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[1:2], null, v3, v9, v[1:2] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[4:5], null, v6, v8, v[4:5] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[1:2], null, v4, v7, v[1:2] +; GFX10-GISEL-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v1, vcc_lo +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[0:1], null, v8, v2, 0 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[4:5], null, v8, v3, v[1:2] +; GFX10-GISEL-NEXT: v_add_co_u32 v1, vcc_lo, v0, v8 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[6:7], null, v9, v2, v[4:5] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[4:5], null, v1, v2, 0 +; GFX10-GISEL-NEXT: v_add_co_ci_u32_e64 v9, null, v6, v9, vcc_lo +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[7:8], null, v1, v3, v[5:6] +; GFX10-GISEL-NEXT: v_add_co_u32 v8, vcc_lo, v0, 1 +; GFX10-GISEL-NEXT: v_add_co_ci_u32_e64 v0, null, 0, v6, vcc_lo +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[5:6], null, v4, v8, 0 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[2:3], null, v9, v2, v[7:8] +; GFX10-GISEL-NEXT: v_add_co_u32 v7, vcc_lo, v4, 1 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[3:4], null, v4, v0, v[6:7] +; GFX10-GISEL-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v2, vcc_lo +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[0:1], null, v5, v7, 0 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[2:3], null, v2, v8, v[3:4] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[3:4], null, v5, v4, v[1:2] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[1:2], null, v2, v7, v[3:4] ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-LABEL: clpeak_imad_pat_i64: @@ -6049,37 +6041,35 @@ define i64 @clpeak_imad_pat_i64(i64 %x, i64 %y) { ; GFX11-GISEL-LABEL: clpeak_imad_pat_i64: ; GFX11-GISEL: ; %bb.0: ; %entry ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_add_co_u32 v7, vcc_lo, v0, 1 +; GFX11-GISEL-NEXT: v_add_co_u32 v8, vcc_lo, v0, 1 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v8, null, 0, v1, vcc_lo -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[0:1], null, v7, v2, 0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[4:5], null, v7, v3, v[1:2] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[5:6], null, v8, v2, v[4:5] -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_add_co_u32 v4, vcc_lo, v0, v7 -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[6:7], null, v4, v2, 0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v10, null, v5, v8, vcc_lo +; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v1, vcc_lo +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[0:1], null, v8, v2, 0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[4:5], null, v8, v3, v[1:2] +; GFX11-GISEL-NEXT: v_add_co_u32 v1, vcc_lo, v0, v8 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[6:7], null, v9, v2, v[4:5] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[4:5], null, v1, v2, 0 
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v10, null, v6, v9, vcc_lo +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[7:8], null, v1, v3, v[5:6] ; GFX11-GISEL-NEXT: v_add_co_u32 v11, vcc_lo, v0, 1 -; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v12, null, 0, v5, vcc_lo -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, v7 -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[7:8], null, v4, v3, v[1:2] -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[3:4], null, v6, v11, 0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v0, null, 0, v6, vcc_lo +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[5:6], null, v4, v11, 0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX11-GISEL-NEXT: v_mad_u64_u32 v[8:9], null, v10, v2, v[7:8] -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, v4 -; GFX11-GISEL-NEXT: v_add_co_u32 v9, vcc_lo, v6, 1 -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[4:5], null, v6, v12, v[2:3] -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[0:1], null, v3, v9, 0 +; GFX11-GISEL-NEXT: v_add_co_u32 v9, vcc_lo, v4, 1 +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[2:3], null, v4, v0, v[6:7] +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[0:1], null, v5, v9, 0 ; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v10, null, 0, v8, vcc_lo -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[5:6], null, v8, v11, v[4:5] -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[6:7], null, v3, v10, v[1:2] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[1:2], null, v5, v9, v[6:7] +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[3:4], null, v8, v11, v[2:3] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[6:7], null, v5, v10, v[1:2] +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[1:2], null, v3, v9, v[6:7] ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX1200-SDAG-LABEL: clpeak_imad_pat_i64: @@ -6408,52 +6398,46 @@ define <2 x i64> @clpeak_imad_pat_v2i64(<2 x i64> %x, <2 x i64> %y) { ; GFX7-GISEL-LABEL: clpeak_imad_pat_v2i64: ; GFX7-GISEL: ; %bb.0: ; %entry ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-GISEL-NEXT: v_add_i32_e32 v14, vcc, 1, v0 -; GFX7-GISEL-NEXT: v_addc_u32_e32 v15, vcc, 0, v1, vcc +; GFX7-GISEL-NEXT: v_add_i32_e32 v12, vcc, 1, v0 +; GFX7-GISEL-NEXT: v_addc_u32_e32 v13, vcc, 0, v1, vcc +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v12, v4, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v5, v[1:2] +; GFX7-GISEL-NEXT: v_add_i32_e32 v1, vcc, v0, v12 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v13, v4, v[8:9] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v1, v4, 0 +; GFX7-GISEL-NEXT: v_addc_u32_e32 v15, vcc, v10, v13, vcc ; GFX7-GISEL-NEXT: v_add_i32_e32 v16, vcc, 1, v2 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v14, v4, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v1, v5, v[9:10] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v16, 
v6, 0 ; GFX7-GISEL-NEXT: v_addc_u32_e32 v17, vcc, 0, v3, vcc -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v16, v6, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v14, v5, v[1:2] -; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v3 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v16, v7, v[1:2] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v15, v4, v[8:9] -; GFX7-GISEL-NEXT: v_add_i32_e32 v3, vcc, v0, v14 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v17, v6, v[10:11] -; GFX7-GISEL-NEXT: v_addc_u32_e32 v18, vcc, v12, v15, vcc -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v3, v4, 0 -; GFX7-GISEL-NEXT: v_add_i32_e32 v16, vcc, v2, v16 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v16, v6, 0 -; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v10 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v3, v5, v[1:2] -; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v14 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v16, v7, v[1:2] -; GFX7-GISEL-NEXT: v_addc_u32_e32 v19, vcc, v8, v17, vcc -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v18, v4, v[10:11] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v19, v6, v[14:15] -; GFX7-GISEL-NEXT: v_add_i32_e32 v10, vcc, 1, v0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v9, v10, 0 -; GFX7-GISEL-NEXT: v_addc_u32_e32 v6, vcc, 0, v12, vcc -; GFX7-GISEL-NEXT: v_add_i32_e32 v12, vcc, 1, v2 -; GFX7-GISEL-NEXT: v_mov_b32_e32 v0, v5 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v6, v[0:1] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v13, v12, 0 -; GFX7-GISEL-NEXT: v_addc_u32_e32 v11, vcc, 0, v8, vcc -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v16, v10, v[1:2] -; GFX7-GISEL-NEXT: v_mov_b32_e32 v0, v6 -; GFX7-GISEL-NEXT: v_add_i32_e32 v14, vcc, 1, v9 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v13, v11, v[0:1] -; GFX7-GISEL-NEXT: v_addc_u32_e32 v15, vcc, 0, v16, vcc -; GFX7-GISEL-NEXT: v_add_i32_e32 v17, vcc, 1, v13 -; GFX7-GISEL-NEXT: v_addc_u32_e32 v18, vcc, 0, v3, vcc -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v14, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v3, v12, v[8:9] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v5, v17, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v4, v15, v[1:2] -; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v3 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v5, v18, v[1:2] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v7, v14, v[8:9] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v10, v17, v[11:12] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v15, v4, v[11:12] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v16, v7, v[2:3] +; GFX7-GISEL-NEXT: v_add_i32_e32 v9, vcc, v1, v16 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v17, v6, v[3:4] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v6, 0 +; GFX7-GISEL-NEXT: v_addc_u32_e32 v12, vcc, v11, v17, vcc +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v9, v7, v[3:4] +; GFX7-GISEL-NEXT: v_add_i32_e32 v9, vcc, 1, v0 +; GFX7-GISEL-NEXT: v_addc_u32_e32 v7, vcc, 0, v10, vcc +; GFX7-GISEL-NEXT: v_add_i32_e32 v10, vcc, 1, v1 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v12, v6, v[4:5] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v10, 0 +; GFX7-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v11, vcc +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, v3, v[5:6] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v8, v9, 0 +; GFX7-GISEL-NEXT: v_add_i32_e32 v15, vcc, 1, v2 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v8, v7, v[6:7] +; GFX7-GISEL-NEXT: v_add_i32_e64 v16, s[4:5], 1, v8 +; 
GFX7-GISEL-NEXT: v_addc_u32_e64 v6, s[4:5], 0, v13, s[4:5] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v13, v9, v[2:3] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v14, v10, v[0:1] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v16, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v15, 0 +; GFX7-GISEL-NEXT: v_addc_u32_e32 v17, vcc, 0, v14, vcc +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v5, v6, v[1:2] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v4, v17, v[3:4] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v7, v16, v[9:10] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v8, v15, v[11:12] ; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v5 ; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] ; @@ -6513,52 +6497,46 @@ define <2 x i64> @clpeak_imad_pat_v2i64(<2 x i64> %x, <2 x i64> %y) { ; GFX8-GISEL-LABEL: clpeak_imad_pat_v2i64: ; GFX8-GISEL: ; %bb.0: ; %entry ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-GISEL-NEXT: v_add_u32_e32 v14, vcc, 1, v0 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v15, vcc, 0, v1, vcc +; GFX8-GISEL-NEXT: v_add_u32_e32 v12, vcc, 1, v0 +; GFX8-GISEL-NEXT: v_addc_u32_e32 v13, vcc, 0, v1, vcc +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v12, v4, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v5, v[1:2] +; GFX8-GISEL-NEXT: v_add_u32_e32 v1, vcc, v0, v12 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v13, v4, v[8:9] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v1, v4, 0 +; GFX8-GISEL-NEXT: v_addc_u32_e32 v15, vcc, v10, v13, vcc ; GFX8-GISEL-NEXT: v_add_u32_e32 v16, vcc, 1, v2 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v14, v4, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v1, v5, v[9:10] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v16, v6, 0 ; GFX8-GISEL-NEXT: v_addc_u32_e32 v17, vcc, 0, v3, vcc -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v16, v6, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v14, v5, v[1:2] -; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v3 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v16, v7, v[1:2] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v15, v4, v[8:9] -; GFX8-GISEL-NEXT: v_add_u32_e32 v3, vcc, v0, v14 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v17, v6, v[10:11] -; GFX8-GISEL-NEXT: v_addc_u32_e32 v18, vcc, v12, v15, vcc -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v3, v4, 0 -; GFX8-GISEL-NEXT: v_add_u32_e32 v16, vcc, v2, v16 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v16, v6, 0 -; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v10 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v3, v5, v[1:2] -; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v14 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v16, v7, v[1:2] -; GFX8-GISEL-NEXT: v_addc_u32_e32 v19, vcc, v8, v17, vcc -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v18, v4, v[10:11] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v19, v6, v[14:15] -; GFX8-GISEL-NEXT: v_add_u32_e32 v10, vcc, 1, v0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v9, v10, 0 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v6, vcc, 0, v12, vcc -; GFX8-GISEL-NEXT: v_add_u32_e32 v12, vcc, 1, v2 -; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, v5 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v6, v[0:1] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v13, v12, 0 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v11, vcc, 0, v8, vcc -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v16, v10, v[1:2] -; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, v6 -; GFX8-GISEL-NEXT: v_add_u32_e32 v14, vcc, 1, v9 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v13, v11, 
v[0:1] -; GFX8-GISEL-NEXT: v_addc_u32_e32 v15, vcc, 0, v16, vcc -; GFX8-GISEL-NEXT: v_add_u32_e32 v17, vcc, 1, v13 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v18, vcc, 0, v3, vcc -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v14, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v3, v12, v[8:9] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v5, v17, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v4, v15, v[1:2] -; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v3 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v5, v18, v[1:2] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v7, v14, v[8:9] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v10, v17, v[11:12] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v15, v4, v[11:12] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v16, v7, v[2:3] +; GFX8-GISEL-NEXT: v_add_u32_e32 v9, vcc, v1, v16 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v17, v6, v[3:4] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v6, 0 +; GFX8-GISEL-NEXT: v_addc_u32_e32 v12, vcc, v11, v17, vcc +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v9, v7, v[3:4] +; GFX8-GISEL-NEXT: v_add_u32_e32 v9, vcc, 1, v0 +; GFX8-GISEL-NEXT: v_addc_u32_e32 v7, vcc, 0, v10, vcc +; GFX8-GISEL-NEXT: v_add_u32_e32 v10, vcc, 1, v1 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v12, v6, v[4:5] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v10, 0 +; GFX8-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v11, vcc +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, v3, v[5:6] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v8, v9, 0 +; GFX8-GISEL-NEXT: v_add_u32_e32 v15, vcc, 1, v2 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v8, v7, v[6:7] +; GFX8-GISEL-NEXT: v_add_u32_e64 v16, s[4:5], 1, v8 +; GFX8-GISEL-NEXT: v_addc_u32_e64 v6, s[4:5], 0, v13, s[4:5] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v13, v9, v[2:3] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v14, v10, v[0:1] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v16, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v15, 0 +; GFX8-GISEL-NEXT: v_addc_u32_e32 v17, vcc, 0, v14, vcc +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v5, v6, v[1:2] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v4, v17, v[3:4] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v7, v16, v[9:10] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v8, v15, v[11:12] ; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v5 ; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] ; @@ -6610,52 +6588,46 @@ define <2 x i64> @clpeak_imad_pat_v2i64(<2 x i64> %x, <2 x i64> %y) { ; GFX900-GISEL-LABEL: clpeak_imad_pat_v2i64: ; GFX900-GISEL: ; %bb.0: ; %entry ; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-GISEL-NEXT: v_add_co_u32_e32 v14, vcc, 1, v0 -; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v1, vcc +; GFX900-GISEL-NEXT: v_add_co_u32_e32 v12, vcc, 1, v0 +; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v1, vcc +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v12, v4, 0 +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v5, v[1:2] +; GFX900-GISEL-NEXT: v_add_co_u32_e32 v1, vcc, v0, v12 +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v13, v4, v[8:9] +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v1, v4, 0 +; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v15, vcc, v10, v13, vcc ; GFX900-GISEL-NEXT: v_add_co_u32_e32 v16, vcc, 1, v2 -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v14, v4, 0 +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v1, v5, v[9:10] +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[1:2], 
s[4:5], v16, v6, 0 ; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v17, vcc, 0, v3, vcc -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v16, v6, 0 -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v14, v5, v[1:2] -; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v16, v7, v[1:2] -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v15, v4, v[8:9] -; GFX900-GISEL-NEXT: v_add_co_u32_e32 v3, vcc, v0, v14 -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v17, v6, v[10:11] -; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v18, vcc, v12, v15, vcc -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v3, v4, 0 -; GFX900-GISEL-NEXT: v_add_co_u32_e32 v16, vcc, v2, v16 -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v16, v6, 0 -; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, v10 -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v3, v5, v[1:2] -; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, v14 -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v16, v7, v[1:2] -; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v19, vcc, v8, v17, vcc -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v18, v4, v[10:11] -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v19, v6, v[14:15] -; GFX900-GISEL-NEXT: v_add_co_u32_e32 v10, vcc, 1, v0 -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v9, v10, 0 -; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v12, vcc -; GFX900-GISEL-NEXT: v_add_co_u32_e32 v12, vcc, 1, v2 -; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v6, v[0:1] -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v13, v12, 0 -; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v8, vcc -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v16, v10, v[1:2] -; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-GISEL-NEXT: v_add_co_u32_e32 v14, vcc, 1, v9 -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v13, v11, v[0:1] -; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v16, vcc -; GFX900-GISEL-NEXT: v_add_co_u32_e32 v17, vcc, 1, v13 -; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v18, vcc, 0, v3, vcc -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v14, 0 -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v3, v12, v[8:9] -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v5, v17, 0 -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v4, v15, v[1:2] -; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v5, v18, v[1:2] -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v7, v14, v[8:9] -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v10, v17, v[11:12] +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v15, v4, v[11:12] +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v16, v7, v[2:3] +; GFX900-GISEL-NEXT: v_add_co_u32_e32 v9, vcc, v1, v16 +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v17, v6, v[3:4] +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v6, 0 +; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v12, vcc, v11, v17, vcc +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v9, v7, v[3:4] +; GFX900-GISEL-NEXT: v_add_co_u32_e32 v9, vcc, 1, v0 +; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v10, vcc +; GFX900-GISEL-NEXT: v_add_co_u32_e32 v10, vcc, 1, v1 +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v12, v6, v[4:5] +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v10, 0 +; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v11, vcc +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, v3, v[5:6] +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v8, v9, 0 +; 
GFX900-GISEL-NEXT: v_add_co_u32_e32 v15, vcc, 1, v2 +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v8, v7, v[6:7] +; GFX900-GISEL-NEXT: v_add_co_u32_e64 v16, s[4:5], 1, v8 +; GFX900-GISEL-NEXT: v_addc_co_u32_e64 v6, s[4:5], 0, v13, s[4:5] +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v13, v9, v[2:3] +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v14, v10, v[0:1] +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v16, 0 +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v15, 0 +; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v17, vcc, 0, v14, vcc +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v5, v6, v[1:2] +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v4, v17, v[3:4] +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v7, v16, v[9:10] +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v8, v15, v[11:12] ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, v5 ; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31] ; @@ -6805,50 +6777,46 @@ define <2 x i64> @clpeak_imad_pat_v2i64(<2 x i64> %x, <2 x i64> %y) { ; GFX10-GISEL-LABEL: clpeak_imad_pat_v2i64: ; GFX10-GISEL: ; %bb.0: ; %entry ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: v_add_co_u32 v12, vcc_lo, v0, 1 -; GFX10-GISEL-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v1, vcc_lo -; GFX10-GISEL-NEXT: v_add_co_u32 v14, vcc_lo, v2, 1 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[0:1], null, v12, v4, 0 -; GFX10-GISEL-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v3, vcc_lo -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[2:3], null, v14, v6, 0 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[8:9], null, v12, v5, v[1:2] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[9:10], null, v14, v7, v[3:4] -; GFX10-GISEL-NEXT: v_add_co_u32 v3, vcc_lo, v0, v12 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[10:11], null, v13, v4, v[8:9] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[11:12], null, v3, v4, 0 -; GFX10-GISEL-NEXT: v_add_co_ci_u32_e64 v16, null, v10, v13, vcc_lo -; GFX10-GISEL-NEXT: v_add_co_u32 v17, vcc_lo, v2, v14 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[8:9], null, v15, v6, v[9:10] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[13:14], null, v17, v6, 0 -; GFX10-GISEL-NEXT: v_add_co_ci_u32_e64 v18, null, v8, v15, vcc_lo -; GFX10-GISEL-NEXT: v_add_co_u32 v19, vcc_lo, v0, 1 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, v12 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, v14 -; GFX10-GISEL-NEXT: v_add_co_ci_u32_e64 v20, null, 0, v10, vcc_lo -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[9:10], null, v11, v19, 0 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[14:15], null, v3, v5, v[0:1] -; GFX10-GISEL-NEXT: v_add_co_u32 v15, vcc_lo, v2, 1 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[0:1], null, v17, v7, v[1:2] -; GFX10-GISEL-NEXT: v_add_co_ci_u32_e64 v12, null, 0, v8, vcc_lo -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[7:8], null, v13, v15, 0 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, v10 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[4:5], null, v16, v4, v[14:15] -; GFX10-GISEL-NEXT: v_add_co_u32 v14, vcc_lo, v11, 1 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[5:6], null, v18, v6, v[0:1] -; GFX10-GISEL-NEXT: v_mov_b32_e32 v6, v8 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[10:11], null, v11, v20, v[1:2] -; GFX10-GISEL-NEXT: v_add_co_ci_u32_e64 v16, null, 0, v4, vcc_lo -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[11:12], null, v13, v12, v[6:7] -; GFX10-GISEL-NEXT: v_add_co_u32 v17, vcc_lo, v13, 1 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[0:1], null, v9, v14, 0 -; GFX10-GISEL-NEXT: v_add_co_ci_u32_e64 v18, null, 0, v5, vcc_lo -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[12:13], null, v4, v19, v[10:11] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[2:3], null, v7, v17, 0 -; GFX10-GISEL-NEXT: v_mad_u64_u32 
v[4:5], null, v5, v15, v[11:12] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[5:6], null, v9, v16, v[1:2] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[7:8], null, v7, v18, v[3:4] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[5:6], null, v12, v14, v[5:6] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[3:4], null, v4, v17, v[7:8] +; GFX10-GISEL-NEXT: v_add_co_u32 v11, vcc_lo, v0, 1 +; GFX10-GISEL-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v1, vcc_lo +; GFX10-GISEL-NEXT: v_add_co_u32 v16, vcc_lo, v2, 1 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[0:1], null, v11, v4, 0 +; GFX10-GISEL-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v3, vcc_lo +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[2:3], null, v16, v6, 0 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[8:9], null, v11, v5, v[1:2] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[9:10], null, v16, v7, v[3:4] +; GFX10-GISEL-NEXT: v_add_co_u32 v1, vcc_lo, v0, v11 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[12:13], null, v15, v4, v[8:9] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[13:14], null, v17, v6, v[9:10] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[8:9], null, v1, v4, 0 +; GFX10-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, v12, v15, vcc_lo +; GFX10-GISEL-NEXT: v_add_co_u32 v16, vcc_lo, v2, v16 +; GFX10-GISEL-NEXT: v_add_co_ci_u32_e64 v18, null, v13, v17, vcc_lo +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[10:11], null, v16, v6, 0 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[14:15], null, v1, v5, v[9:10] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[15:16], null, v16, v7, v[11:12] +; GFX10-GISEL-NEXT: v_add_co_u32 v11, vcc_lo, v0, 1 +; GFX10-GISEL-NEXT: v_add_co_ci_u32_e64 v0, null, 0, v12, vcc_lo +; GFX10-GISEL-NEXT: v_add_co_u32 v12, vcc_lo, v2, 1 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[16:17], null, v3, v4, v[14:15] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[4:5], null, v8, v11, 0 +; GFX10-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v13, vcc_lo +; GFX10-GISEL-NEXT: v_add_co_u32 v13, vcc_lo, v8, 1 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[14:15], null, v18, v6, v[15:16] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[6:7], null, v10, v12, 0 +; GFX10-GISEL-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v16, vcc_lo +; GFX10-GISEL-NEXT: v_add_co_u32 v17, vcc_lo, v10, 1 +; GFX10-GISEL-NEXT: v_add_co_ci_u32_e64 v18, null, 0, v14, vcc_lo +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[8:9], null, v8, v0, v[5:6] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[2:3], null, v6, v17, 0 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[9:10], null, v10, v1, v[7:8] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[0:1], null, v4, v13, 0 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[7:8], null, v16, v11, v[8:9] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[4:5], null, v4, v15, v[1:2] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[8:9], null, v14, v12, v[9:10] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[9:10], null, v6, v18, v[3:4] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[5:6], null, v7, v13, v[4:5] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[3:4], null, v8, v17, v[9:10] ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, v5 ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] ; @@ -6911,63 +6879,60 @@ define <2 x i64> @clpeak_imad_pat_v2i64(<2 x i64> %x, <2 x i64> %y) { ; GFX11-GISEL-LABEL: clpeak_imad_pat_v2i64: ; GFX11-GISEL: ; %bb.0: ; %entry ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_add_co_u32 v13, vcc_lo, v0, 1 +; GFX11-GISEL-NEXT: v_add_co_u32 v11, vcc_lo, v0, 1 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v14, null, 0, v1, vcc_lo -; GFX11-GISEL-NEXT: v_add_co_u32 v15, vcc_lo, v2, 1 -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[0:1], null, v13, v4, 0 -; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v16, null, 0, v3, 
vcc_lo +; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v1, vcc_lo +; GFX11-GISEL-NEXT: v_add_co_u32 v16, vcc_lo, v2, 1 +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[0:1], null, v11, v4, 0 +; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v3, vcc_lo ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[2:3], null, v15, v6, 0 -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[8:9], null, v13, v5, v[1:2] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[2:3], null, v16, v6, 0 +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[8:9], null, v11, v5, v[1:2] ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[9:10], null, v15, v7, v[3:4] -; GFX11-GISEL-NEXT: v_add_co_u32 v3, vcc_lo, v0, v13 -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[10:11], null, v14, v4, v[8:9] -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[11:12], null, v16, v6, v[9:10] -; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v17, null, v10, v14, vcc_lo -; GFX11-GISEL-NEXT: v_add_co_u32 v18, vcc_lo, v2, v15 -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[8:9], null, v3, v4, 0 -; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v19, null, v11, v16, vcc_lo -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[12:13], null, v18, v6, 0 -; GFX11-GISEL-NEXT: v_add_co_u32 v20, vcc_lo, v0, 1 -; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v10, vcc_lo -; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, v9 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[9:10], null, v8, v20, 0 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, v13 -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[13:14], null, v3, v5, v[0:1] -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, v10 -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[14:15], null, v18, v7, v[1:2] -; GFX11-GISEL-NEXT: v_add_co_u32 v18, vcc_lo, v2, 1 -; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v22, null, 0, v11, vcc_lo +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[9:10], null, v16, v7, v[3:4] +; GFX11-GISEL-NEXT: v_add_co_u32 v1, vcc_lo, v0, v11 +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[12:13], null, v15, v4, v[8:9] ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[15:16], null, v17, v4, v[13:14] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[4:5], null, v12, v18, 0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[10:11], null, v19, v6, v[14:15] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[6:7], null, v8, v21, v[0:1] -; GFX11-GISEL-NEXT: v_add_co_u32 v14, vcc_lo, v8, 1 -; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v16, null, 0, v15, vcc_lo -; GFX11-GISEL-NEXT: v_add_co_u32 v17, vcc_lo, v12, 1 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[7:8], null, v12, v22, v[5:6] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[0:1], null, v9, v14, 0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[2:3], null, v4, v17, 0 -; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v19, null, 0, v10, vcc_lo -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[11:12], null, v15, v20, v[6:7] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[12:13], 
null, v10, v18, v[7:8] -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[7:8], null, v9, v16, v[1:2] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[8:9], null, v4, v19, v[3:4] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[13:14], null, v17, v6, v[9:10] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[8:9], null, v1, v4, 0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, v12, v15, vcc_lo +; GFX11-GISEL-NEXT: v_add_co_u32 v18, vcc_lo, v2, v16 +; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v19, null, v13, v17, vcc_lo +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[10:11], null, v18, v6, 0 +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[14:15], null, v1, v5, v[9:10] +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[15:16], null, v18, v7, v[11:12] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[16:17], null, v3, v4, v[14:15] +; GFX11-GISEL-NEXT: v_add_co_u32 v14, vcc_lo, v0, 1 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[5:6], null, v11, v14, v[7:8] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[3:4], null, v12, v17, v[8:9] -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v0, null, 0, v12, vcc_lo +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[4:5], null, v8, v14, 0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[17:18], null, v19, v6, v[15:16] +; GFX11-GISEL-NEXT: v_add_co_u32 v15, vcc_lo, v2, 1 +; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v13, vcc_lo +; GFX11-GISEL-NEXT: v_add_co_u32 v13, vcc_lo, v8, 1 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[6:7], null, v10, v15, 0 +; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v18, null, 0, v16, vcc_lo +; GFX11-GISEL-NEXT: v_add_co_u32 v19, vcc_lo, v10, 1 +; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v20, null, 0, v17, vcc_lo +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[11:12], null, v8, v0, v[5:6] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[8:9], null, v10, v1, v[7:8] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[0:1], null, v4, v13, 0 +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[2:3], null, v6, v19, 0 +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[9:10], null, v16, v14, v[11:12] +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[10:11], null, v17, v15, v[8:9] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[7:8], null, v4, v18, v[1:2] +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[11:12], null, v6, v20, v[3:4] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[5:6], null, v9, v13, v[7:8] +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[3:4], null, v10, v19, v[11:12] ; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, v5 ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/lds-misaligned-bug.ll b/llvm/test/CodeGen/AMDGPU/lds-misaligned-bug.ll index 69a871f6f6ae5..fa0568d307907 100644 --- a/llvm/test/CodeGen/AMDGPU/lds-misaligned-bug.ll 
+++ b/llvm/test/CodeGen/AMDGPU/lds-misaligned-bug.ll @@ -262,12 +262,12 @@ define amdgpu_kernel void @test_flat_misaligned_v2(ptr %arg) { ; ALIGNED-GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; ALIGNED-GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; ALIGNED-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; ALIGNED-GFX10-NEXT: v_add_co_u32 v0, s0, s0, v0 -; ALIGNED-GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0 -; ALIGNED-GFX10-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; ALIGNED-GFX10-NEXT: v_add_co_u32 v3, s0, s0, v0 +; ALIGNED-GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, s1, 0, s0 +; ALIGNED-GFX10-NEXT: flat_load_dwordx2 v[0:1], v[3:4] ; ALIGNED-GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; ALIGNED-GFX10-NEXT: v_mov_b32_e32 v4, v2 -; ALIGNED-GFX10-NEXT: flat_store_dwordx2 v[0:1], v[3:4] +; ALIGNED-GFX10-NEXT: v_mov_b32_e32 v2, v0 +; ALIGNED-GFX10-NEXT: flat_store_dwordx2 v[3:4], v[1:2] ; ALIGNED-GFX10-NEXT: s_endpgm ; ; UNALIGNED-GFX10-LABEL: test_flat_misaligned_v2: @@ -275,12 +275,12 @@ define amdgpu_kernel void @test_flat_misaligned_v2(ptr %arg) { ; UNALIGNED-GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; UNALIGNED-GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; UNALIGNED-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; UNALIGNED-GFX10-NEXT: v_add_co_u32 v0, s0, s0, v0 -; UNALIGNED-GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0 -; UNALIGNED-GFX10-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; UNALIGNED-GFX10-NEXT: v_add_co_u32 v3, s0, s0, v0 +; UNALIGNED-GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, s1, 0, s0 +; UNALIGNED-GFX10-NEXT: flat_load_dwordx2 v[0:1], v[3:4] ; UNALIGNED-GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; UNALIGNED-GFX10-NEXT: v_mov_b32_e32 v4, v2 -; UNALIGNED-GFX10-NEXT: flat_store_dwordx2 v[0:1], v[3:4] +; UNALIGNED-GFX10-NEXT: v_mov_b32_e32 v2, v0 +; UNALIGNED-GFX10-NEXT: flat_store_dwordx2 v[3:4], v[1:2] ; UNALIGNED-GFX10-NEXT: s_endpgm ; ; ALIGNED-GFX11-LABEL: test_flat_misaligned_v2: @@ -290,13 +290,13 @@ define amdgpu_kernel void @test_flat_misaligned_v2(ptr %arg) { ; ALIGNED-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; ALIGNED-GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; ALIGNED-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; ALIGNED-GFX11-NEXT: v_add_co_u32 v0, s0, s0, v0 +; ALIGNED-GFX11-NEXT: v_add_co_u32 v3, s0, s0, v0 ; ALIGNED-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; ALIGNED-GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s1, 0, s0 -; ALIGNED-GFX11-NEXT: flat_load_b64 v[2:3], v[0:1] +; ALIGNED-GFX11-NEXT: v_add_co_ci_u32_e64 v4, null, s1, 0, s0 +; ALIGNED-GFX11-NEXT: flat_load_b64 v[0:1], v[3:4] ; ALIGNED-GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; ALIGNED-GFX11-NEXT: v_mov_b32_e32 v4, v2 -; ALIGNED-GFX11-NEXT: flat_store_b64 v[0:1], v[3:4] +; ALIGNED-GFX11-NEXT: v_mov_b32_e32 v2, v0 +; ALIGNED-GFX11-NEXT: flat_store_b64 v[3:4], v[1:2] ; ALIGNED-GFX11-NEXT: s_endpgm ; ; UNALIGNED-GFX11-LABEL: test_flat_misaligned_v2: @@ -306,13 +306,13 @@ define amdgpu_kernel void @test_flat_misaligned_v2(ptr %arg) { ; UNALIGNED-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; UNALIGNED-GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; UNALIGNED-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; UNALIGNED-GFX11-NEXT: v_add_co_u32 v0, s0, s0, v0 +; UNALIGNED-GFX11-NEXT: v_add_co_u32 v3, s0, s0, v0 ; UNALIGNED-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; UNALIGNED-GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s1, 0, s0 -; UNALIGNED-GFX11-NEXT: flat_load_b64 v[2:3], v[0:1] +; UNALIGNED-GFX11-NEXT: v_add_co_ci_u32_e64 v4, null, s1, 0, s0 +; UNALIGNED-GFX11-NEXT: flat_load_b64 
v[0:1], v[3:4] ; UNALIGNED-GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; UNALIGNED-GFX11-NEXT: v_mov_b32_e32 v4, v2 -; UNALIGNED-GFX11-NEXT: flat_store_b64 v[0:1], v[3:4] +; UNALIGNED-GFX11-NEXT: v_mov_b32_e32 v2, v0 +; UNALIGNED-GFX11-NEXT: flat_store_b64 v[3:4], v[1:2] ; UNALIGNED-GFX11-NEXT: s_endpgm bb: %lid = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -462,13 +462,12 @@ define amdgpu_kernel void @test_flat_misaligned_v3(ptr %arg) { ; ALIGNED-GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; ALIGNED-GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; ALIGNED-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; ALIGNED-GFX10-NEXT: v_add_co_u32 v5, s0, s0, v0 -; ALIGNED-GFX10-NEXT: v_add_co_ci_u32_e64 v6, s0, s1, 0, s0 -; ALIGNED-GFX10-NEXT: flat_load_dwordx3 v[0:2], v[5:6] +; ALIGNED-GFX10-NEXT: v_add_co_u32 v4, s0, s0, v0 +; ALIGNED-GFX10-NEXT: v_add_co_ci_u32_e64 v5, s0, s1, 0, s0 +; ALIGNED-GFX10-NEXT: flat_load_dwordx3 v[1:3], v[4:5] ; ALIGNED-GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; ALIGNED-GFX10-NEXT: v_mov_b32_e32 v3, v0 -; ALIGNED-GFX10-NEXT: v_mov_b32_e32 v4, v1 -; ALIGNED-GFX10-NEXT: flat_store_dwordx3 v[5:6], v[2:4] +; ALIGNED-GFX10-NEXT: v_mov_b32_e32 v0, v3 +; ALIGNED-GFX10-NEXT: flat_store_dwordx3 v[4:5], v[0:2] ; ALIGNED-GFX10-NEXT: s_endpgm ; ; UNALIGNED-GFX10-LABEL: test_flat_misaligned_v3: @@ -476,13 +475,12 @@ define amdgpu_kernel void @test_flat_misaligned_v3(ptr %arg) { ; UNALIGNED-GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; UNALIGNED-GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; UNALIGNED-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; UNALIGNED-GFX10-NEXT: v_add_co_u32 v5, s0, s0, v0 -; UNALIGNED-GFX10-NEXT: v_add_co_ci_u32_e64 v6, s0, s1, 0, s0 -; UNALIGNED-GFX10-NEXT: flat_load_dwordx3 v[0:2], v[5:6] +; UNALIGNED-GFX10-NEXT: v_add_co_u32 v4, s0, s0, v0 +; UNALIGNED-GFX10-NEXT: v_add_co_ci_u32_e64 v5, s0, s1, 0, s0 +; UNALIGNED-GFX10-NEXT: flat_load_dwordx3 v[1:3], v[4:5] ; UNALIGNED-GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; UNALIGNED-GFX10-NEXT: v_mov_b32_e32 v3, v0 -; UNALIGNED-GFX10-NEXT: v_mov_b32_e32 v4, v1 -; UNALIGNED-GFX10-NEXT: flat_store_dwordx3 v[5:6], v[2:4] +; UNALIGNED-GFX10-NEXT: v_mov_b32_e32 v0, v3 +; UNALIGNED-GFX10-NEXT: flat_store_dwordx3 v[4:5], v[0:2] ; UNALIGNED-GFX10-NEXT: s_endpgm ; ; ALIGNED-GFX11-LABEL: test_flat_misaligned_v3: @@ -492,13 +490,13 @@ define amdgpu_kernel void @test_flat_misaligned_v3(ptr %arg) { ; ALIGNED-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; ALIGNED-GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; ALIGNED-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; ALIGNED-GFX11-NEXT: v_add_co_u32 v5, s0, s0, v0 +; ALIGNED-GFX11-NEXT: v_add_co_u32 v4, s0, s0, v0 ; ALIGNED-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; ALIGNED-GFX11-NEXT: v_add_co_ci_u32_e64 v6, null, s1, 0, s0 -; ALIGNED-GFX11-NEXT: flat_load_b96 v[0:2], v[5:6] +; ALIGNED-GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, s1, 0, s0 +; ALIGNED-GFX11-NEXT: flat_load_b96 v[1:3], v[4:5] ; ALIGNED-GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; ALIGNED-GFX11-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 -; ALIGNED-GFX11-NEXT: flat_store_b96 v[5:6], v[2:4] +; ALIGNED-GFX11-NEXT: v_mov_b32_e32 v0, v3 +; ALIGNED-GFX11-NEXT: flat_store_b96 v[4:5], v[0:2] ; ALIGNED-GFX11-NEXT: s_endpgm ; ; UNALIGNED-GFX11-LABEL: test_flat_misaligned_v3: @@ -508,13 +506,13 @@ define amdgpu_kernel void @test_flat_misaligned_v3(ptr %arg) { ; UNALIGNED-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; UNALIGNED-GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; 
UNALIGNED-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; UNALIGNED-GFX11-NEXT: v_add_co_u32 v5, s0, s0, v0 +; UNALIGNED-GFX11-NEXT: v_add_co_u32 v4, s0, s0, v0 ; UNALIGNED-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; UNALIGNED-GFX11-NEXT: v_add_co_ci_u32_e64 v6, null, s1, 0, s0 -; UNALIGNED-GFX11-NEXT: flat_load_b96 v[0:2], v[5:6] +; UNALIGNED-GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, s1, 0, s0 +; UNALIGNED-GFX11-NEXT: flat_load_b96 v[1:3], v[4:5] ; UNALIGNED-GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; UNALIGNED-GFX11-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 -; UNALIGNED-GFX11-NEXT: flat_store_b96 v[5:6], v[2:4] +; UNALIGNED-GFX11-NEXT: v_mov_b32_e32 v0, v3 +; UNALIGNED-GFX11-NEXT: flat_store_b96 v[4:5], v[0:2] ; UNALIGNED-GFX11-NEXT: s_endpgm bb: %lid = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -606,36 +604,33 @@ define amdgpu_kernel void @test_local_aligned_v3(ptr addrspace(3) %arg) { ; SPLIT: ; %bb.0: ; %bb ; SPLIT-NEXT: s_load_dword s0, s[4:5], 0x24 ; SPLIT-NEXT: s_waitcnt lgkmcnt(0) -; SPLIT-NEXT: v_lshl_add_u32 v5, v0, 2, s0 -; SPLIT-NEXT: ds_read_b96 v[0:2], v5 +; SPLIT-NEXT: v_lshl_add_u32 v4, v0, 2, s0 +; SPLIT-NEXT: ds_read_b96 v[1:3], v4 ; SPLIT-NEXT: s_waitcnt lgkmcnt(0) -; SPLIT-NEXT: v_mov_b32_e32 v3, v0 -; SPLIT-NEXT: v_mov_b32_e32 v4, v1 -; SPLIT-NEXT: ds_write_b96 v5, v[2:4] +; SPLIT-NEXT: v_mov_b32_e32 v0, v3 +; SPLIT-NEXT: ds_write_b96 v4, v[0:2] ; SPLIT-NEXT: s_endpgm ; ; ALIGNED-GFX10-LABEL: test_local_aligned_v3: ; ALIGNED-GFX10: ; %bb.0: ; %bb ; ALIGNED-GFX10-NEXT: s_load_dword s0, s[4:5], 0x24 ; ALIGNED-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; ALIGNED-GFX10-NEXT: v_lshl_add_u32 v5, v0, 2, s0 -; ALIGNED-GFX10-NEXT: ds_read_b96 v[0:2], v5 +; ALIGNED-GFX10-NEXT: v_lshl_add_u32 v4, v0, 2, s0 +; ALIGNED-GFX10-NEXT: ds_read_b96 v[1:3], v4 ; ALIGNED-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; ALIGNED-GFX10-NEXT: v_mov_b32_e32 v3, v0 -; ALIGNED-GFX10-NEXT: v_mov_b32_e32 v4, v1 -; ALIGNED-GFX10-NEXT: ds_write_b96 v5, v[2:4] +; ALIGNED-GFX10-NEXT: v_mov_b32_e32 v0, v3 +; ALIGNED-GFX10-NEXT: ds_write_b96 v4, v[0:2] ; ALIGNED-GFX10-NEXT: s_endpgm ; ; UNALIGNED-GFX10-LABEL: test_local_aligned_v3: ; UNALIGNED-GFX10: ; %bb.0: ; %bb ; UNALIGNED-GFX10-NEXT: s_load_dword s0, s[4:5], 0x24 ; UNALIGNED-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; UNALIGNED-GFX10-NEXT: v_lshl_add_u32 v5, v0, 2, s0 -; UNALIGNED-GFX10-NEXT: ds_read_b96 v[0:2], v5 +; UNALIGNED-GFX10-NEXT: v_lshl_add_u32 v4, v0, 2, s0 +; UNALIGNED-GFX10-NEXT: ds_read_b96 v[1:3], v4 ; UNALIGNED-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; UNALIGNED-GFX10-NEXT: v_mov_b32_e32 v3, v0 -; UNALIGNED-GFX10-NEXT: v_mov_b32_e32 v4, v1 -; UNALIGNED-GFX10-NEXT: ds_write_b96 v5, v[2:4] +; UNALIGNED-GFX10-NEXT: v_mov_b32_e32 v0, v3 +; UNALIGNED-GFX10-NEXT: ds_write_b96 v4, v[0:2] ; UNALIGNED-GFX10-NEXT: s_endpgm ; ; ALIGNED-GFX11-LABEL: test_local_aligned_v3: @@ -644,11 +639,11 @@ define amdgpu_kernel void @test_local_aligned_v3(ptr addrspace(3) %arg) { ; ALIGNED-GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; ALIGNED-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; ALIGNED-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; ALIGNED-GFX11-NEXT: v_lshl_add_u32 v5, v0, 2, s0 -; ALIGNED-GFX11-NEXT: ds_load_b96 v[0:2], v5 +; ALIGNED-GFX11-NEXT: v_lshl_add_u32 v4, v0, 2, s0 +; ALIGNED-GFX11-NEXT: ds_load_b96 v[1:3], v4 ; ALIGNED-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; ALIGNED-GFX11-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 -; ALIGNED-GFX11-NEXT: ds_store_b96 v5, v[2:4] +; ALIGNED-GFX11-NEXT: v_mov_b32_e32 v0, v3 +; ALIGNED-GFX11-NEXT: ds_store_b96 v4, v[0:2] ; ALIGNED-GFX11-NEXT: 
s_endpgm ; ; UNALIGNED-GFX11-LABEL: test_local_aligned_v3: @@ -657,11 +652,11 @@ define amdgpu_kernel void @test_local_aligned_v3(ptr addrspace(3) %arg) { ; UNALIGNED-GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; UNALIGNED-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; UNALIGNED-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; UNALIGNED-GFX11-NEXT: v_lshl_add_u32 v5, v0, 2, s0 -; UNALIGNED-GFX11-NEXT: ds_load_b96 v[0:2], v5 +; UNALIGNED-GFX11-NEXT: v_lshl_add_u32 v4, v0, 2, s0 +; UNALIGNED-GFX11-NEXT: ds_load_b96 v[1:3], v4 ; UNALIGNED-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; UNALIGNED-GFX11-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 -; UNALIGNED-GFX11-NEXT: ds_store_b96 v5, v[2:4] +; UNALIGNED-GFX11-NEXT: v_mov_b32_e32 v0, v3 +; UNALIGNED-GFX11-NEXT: ds_store_b96 v4, v[0:2] ; UNALIGNED-GFX11-NEXT: s_endpgm bb: %lid = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -683,12 +678,12 @@ define amdgpu_kernel void @test_flat_aligned_v2(ptr %arg) { ; SPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; SPLIT-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SPLIT-NEXT: s_waitcnt lgkmcnt(0) -; SPLIT-NEXT: v_add_co_u32 v0, s0, s0, v0 -; SPLIT-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0 -; SPLIT-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; SPLIT-NEXT: v_add_co_u32 v3, s0, s0, v0 +; SPLIT-NEXT: v_add_co_ci_u32_e64 v4, s0, s1, 0, s0 +; SPLIT-NEXT: flat_load_dwordx2 v[0:1], v[3:4] ; SPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SPLIT-NEXT: v_mov_b32_e32 v4, v2 -; SPLIT-NEXT: flat_store_dwordx2 v[0:1], v[3:4] +; SPLIT-NEXT: v_mov_b32_e32 v2, v0 +; SPLIT-NEXT: flat_store_dwordx2 v[3:4], v[1:2] ; SPLIT-NEXT: s_endpgm ; ; ALIGNED-GFX10-LABEL: test_flat_aligned_v2: @@ -696,12 +691,12 @@ define amdgpu_kernel void @test_flat_aligned_v2(ptr %arg) { ; ALIGNED-GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; ALIGNED-GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; ALIGNED-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; ALIGNED-GFX10-NEXT: v_add_co_u32 v0, s0, s0, v0 -; ALIGNED-GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0 -; ALIGNED-GFX10-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; ALIGNED-GFX10-NEXT: v_add_co_u32 v3, s0, s0, v0 +; ALIGNED-GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, s1, 0, s0 +; ALIGNED-GFX10-NEXT: flat_load_dwordx2 v[0:1], v[3:4] ; ALIGNED-GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; ALIGNED-GFX10-NEXT: v_mov_b32_e32 v4, v2 -; ALIGNED-GFX10-NEXT: flat_store_dwordx2 v[0:1], v[3:4] +; ALIGNED-GFX10-NEXT: v_mov_b32_e32 v2, v0 +; ALIGNED-GFX10-NEXT: flat_store_dwordx2 v[3:4], v[1:2] ; ALIGNED-GFX10-NEXT: s_endpgm ; ; UNALIGNED-GFX10-LABEL: test_flat_aligned_v2: @@ -709,12 +704,12 @@ define amdgpu_kernel void @test_flat_aligned_v2(ptr %arg) { ; UNALIGNED-GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; UNALIGNED-GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; UNALIGNED-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; UNALIGNED-GFX10-NEXT: v_add_co_u32 v0, s0, s0, v0 -; UNALIGNED-GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0 -; UNALIGNED-GFX10-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; UNALIGNED-GFX10-NEXT: v_add_co_u32 v3, s0, s0, v0 +; UNALIGNED-GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, s1, 0, s0 +; UNALIGNED-GFX10-NEXT: flat_load_dwordx2 v[0:1], v[3:4] ; UNALIGNED-GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; UNALIGNED-GFX10-NEXT: v_mov_b32_e32 v4, v2 -; UNALIGNED-GFX10-NEXT: flat_store_dwordx2 v[0:1], v[3:4] +; UNALIGNED-GFX10-NEXT: v_mov_b32_e32 v2, v0 +; UNALIGNED-GFX10-NEXT: flat_store_dwordx2 v[3:4], v[1:2] ; UNALIGNED-GFX10-NEXT: s_endpgm ; ; ALIGNED-GFX11-LABEL: test_flat_aligned_v2: @@ -724,13 +719,13 @@ define amdgpu_kernel void @test_flat_aligned_v2(ptr %arg) 
{ ; ALIGNED-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; ALIGNED-GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; ALIGNED-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; ALIGNED-GFX11-NEXT: v_add_co_u32 v0, s0, s0, v0 +; ALIGNED-GFX11-NEXT: v_add_co_u32 v3, s0, s0, v0 ; ALIGNED-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; ALIGNED-GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s1, 0, s0 -; ALIGNED-GFX11-NEXT: flat_load_b64 v[2:3], v[0:1] +; ALIGNED-GFX11-NEXT: v_add_co_ci_u32_e64 v4, null, s1, 0, s0 +; ALIGNED-GFX11-NEXT: flat_load_b64 v[0:1], v[3:4] ; ALIGNED-GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; ALIGNED-GFX11-NEXT: v_mov_b32_e32 v4, v2 -; ALIGNED-GFX11-NEXT: flat_store_b64 v[0:1], v[3:4] +; ALIGNED-GFX11-NEXT: v_mov_b32_e32 v2, v0 +; ALIGNED-GFX11-NEXT: flat_store_b64 v[3:4], v[1:2] ; ALIGNED-GFX11-NEXT: s_endpgm ; ; UNALIGNED-GFX11-LABEL: test_flat_aligned_v2: @@ -740,13 +735,13 @@ define amdgpu_kernel void @test_flat_aligned_v2(ptr %arg) { ; UNALIGNED-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; UNALIGNED-GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; UNALIGNED-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; UNALIGNED-GFX11-NEXT: v_add_co_u32 v0, s0, s0, v0 +; UNALIGNED-GFX11-NEXT: v_add_co_u32 v3, s0, s0, v0 ; UNALIGNED-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; UNALIGNED-GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s1, 0, s0 -; UNALIGNED-GFX11-NEXT: flat_load_b64 v[2:3], v[0:1] +; UNALIGNED-GFX11-NEXT: v_add_co_ci_u32_e64 v4, null, s1, 0, s0 +; UNALIGNED-GFX11-NEXT: flat_load_b64 v[0:1], v[3:4] ; UNALIGNED-GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; UNALIGNED-GFX11-NEXT: v_mov_b32_e32 v4, v2 -; UNALIGNED-GFX11-NEXT: flat_store_b64 v[0:1], v[3:4] +; UNALIGNED-GFX11-NEXT: v_mov_b32_e32 v2, v0 +; UNALIGNED-GFX11-NEXT: flat_store_b64 v[3:4], v[1:2] ; UNALIGNED-GFX11-NEXT: s_endpgm bb: %lid = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -942,21 +937,19 @@ define amdgpu_kernel void @test_flat_v4_aligned8(ptr %arg) { ; SPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; SPLIT-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SPLIT-NEXT: s_waitcnt lgkmcnt(0) -; SPLIT-NEXT: v_add_co_u32 v0, s0, s0, v0 -; SPLIT-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0 -; SPLIT-NEXT: v_add_co_u32 v2, vcc_lo, v0, 8 -; SPLIT-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo +; SPLIT-NEXT: v_add_co_u32 v6, s0, s0, v0 +; SPLIT-NEXT: v_add_co_ci_u32_e64 v7, s0, s1, 0, s0 +; SPLIT-NEXT: v_add_co_u32 v8, vcc_lo, v6, 8 +; SPLIT-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, 0, v7, vcc_lo ; SPLIT-NEXT: s_clause 0x1 -; SPLIT-NEXT: flat_load_dwordx2 v[4:5], v[0:1] -; SPLIT-NEXT: flat_load_dwordx2 v[6:7], v[2:3] +; SPLIT-NEXT: flat_load_dwordx2 v[0:1], v[6:7] +; SPLIT-NEXT: flat_load_dwordx2 v[3:4], v[8:9] ; SPLIT-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) -; SPLIT-NEXT: v_mov_b32_e32 v8, v5 -; SPLIT-NEXT: v_mov_b32_e32 v9, v4 +; SPLIT-NEXT: v_mov_b32_e32 v2, v0 ; SPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SPLIT-NEXT: v_mov_b32_e32 v4, v7 -; SPLIT-NEXT: v_mov_b32_e32 v5, v6 -; SPLIT-NEXT: flat_store_dwordx2 v[2:3], v[8:9] -; SPLIT-NEXT: flat_store_dwordx2 v[0:1], v[4:5] +; SPLIT-NEXT: v_mov_b32_e32 v5, v3 +; SPLIT-NEXT: flat_store_dwordx2 v[8:9], v[1:2] +; SPLIT-NEXT: flat_store_dwordx2 v[6:7], v[4:5] ; SPLIT-NEXT: s_endpgm ; ; ALIGNED-GFX10-LABEL: test_flat_v4_aligned8: diff --git a/llvm/test/CodeGen/AMDGPU/limit-coalesce.mir b/llvm/test/CodeGen/AMDGPU/limit-coalesce.mir index 02eda2c4822c2..e5b68b48158da 100644 --- a/llvm/test/CodeGen/AMDGPU/limit-coalesce.mir +++ 
b/llvm/test/CodeGen/AMDGPU/limit-coalesce.mir @@ -5,7 +5,35 @@ # source. # No more registers shall be defined --- -name: main +name: limit_coalesce +tracksRegLiveness: true +body: | + bb.0: + liveins: $sgpr16, $sgpr17 + + ; CHECK-LABEL: name: limit_coalesce + ; CHECK: liveins: $sgpr16, $sgpr17 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub1:sgpr_64 = COPY $sgpr17 + ; CHECK-NEXT: [[COPY:%[0-9]+]].sub0:sgpr_64 = COPY $sgpr16 + ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; CHECK-NEXT: INLINEASM &"; def $0", 0 /* attdialect */, 3407882 /* regdef:VReg_64 */, def %4 + ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:vreg_128 = COPY %4.sub1 + ; CHECK-NEXT: GLOBAL_STORE_DWORDX4_SADDR [[V_MOV_B32_e32_]], [[COPY1]], [[COPY]], 0, 0, implicit $exec :: (store (s128), addrspace 1) + ; CHECK-NEXT: SI_RETURN + %0:sgpr_32 = COPY killed $sgpr17 + %1:sgpr_32 = COPY killed $sgpr16 + undef %2.sub0:sgpr_64 = COPY killed %1 + %2.sub1:sgpr_64 = COPY killed %0 + %3:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + INLINEASM &"; def $0", 0 /* attdialect */, 3407882 /* regdef:VReg_64 */, def %4:vreg_64 + undef %5.sub0:vreg_128 = COPY killed %4.sub1 + GLOBAL_STORE_DWORDX4_SADDR killed %3, killed %5, killed %2, 0, 0, implicit $exec :: (store (s128), addrspace 1) + SI_RETURN +... + +--- +name: allow_coalesce tracksRegLiveness: true registers: - { id: 0, class: sreg_32_xm0, preferred-register: '%0' } @@ -14,23 +42,18 @@ body: | bb.0: liveins: $sgpr0, $vgpr0_vgpr1 - ; CHECK-LABEL: name: main + ; CHECK-LABEL: name: allow_coalesce ; CHECK: liveins: $sgpr0, $vgpr0_vgpr1 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[DEF:%[0-9]+]]:vreg_64 = IMPLICIT_DEF - ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:vreg_64 = COPY $sgpr0 - ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1:vreg_64 = COPY [[DEF]].sub0 - ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:vreg_64 = COPY [[COPY]].sub1 - ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:vreg_64 = COPY [[COPY]].sub0 - ; CHECK-NEXT: FLAT_STORE_DWORDX2 $vgpr0_vgpr1, [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr - ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vreg_96 = IMPLICIT_DEF - ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0_sub1:vreg_96 = COPY [[DEF1]] - ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub2:vreg_96 = COPY [[DEF]].sub0 - ; CHECK-NEXT: FLAT_STORE_DWORDX3 $vgpr0_vgpr1, [[COPY2]], 0, 0, implicit $exec, implicit $flat_scr - ; CHECK-NEXT: [[DEF2:%[0-9]+]]:vreg_128 = IMPLICIT_DEF - ; CHECK-NEXT: undef [[COPY3:%[0-9]+]].sub0_sub1_sub2:vreg_128 = COPY [[DEF2]] - ; CHECK-NEXT: [[COPY3:%[0-9]+]].sub3:vreg_128 = COPY [[DEF]].sub0 - ; CHECK-NEXT: FLAT_STORE_DWORDX4 $vgpr0_vgpr1, [[COPY3]], 0, 0, implicit $exec, implicit $flat_scr + ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub1:vreg_128 = COPY $sgpr0 + ; CHECK-NEXT: FLAT_STORE_DWORDX2 $vgpr0_vgpr1, [[COPY]].sub0_sub1, 0, 0, implicit $exec, implicit $flat_scr + ; CHECK-NEXT: [[DEF:%[0-9]+]]:vreg_96 = IMPLICIT_DEF + ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0_sub1:vreg_96 = COPY [[DEF]] + ; CHECK-NEXT: FLAT_STORE_DWORDX3 $vgpr0_vgpr1, [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vreg_128 = IMPLICIT_DEF + ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0_sub1_sub2:vreg_128 = COPY [[DEF1]] + ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub3:vreg_128 = COPY undef [[COPY]].sub2 + ; CHECK-NEXT: FLAT_STORE_DWORDX4 $vgpr0_vgpr1, [[COPY2]], 0, 0, implicit $exec, implicit $flat_scr %2:vreg_64 = IMPLICIT_DEF undef %3.sub0:vreg_64 = COPY $sgpr0 %3.sub1:vreg_64 = COPY %2.sub0 @@ -49,3 +72,4 @@ body: | FLAT_STORE_DWORDX4 $vgpr0_vgpr1, killed %8, 0, 0, 
implicit $exec, implicit $flat_scr ... + diff --git a/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll b/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll index 8bb7274c84620..76b97e843d777 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll @@ -78,7 +78,6 @@ define amdgpu_kernel void @v_round_f64(ptr addrspace(1) %out, ptr addrspace(1) % ; SI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_mov_b32 s3, 0xfffff -; SI-NEXT: v_mov_b32_e32 v8, 0x3ff00000 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_bfe_u32 v4, v3, 20, 11 ; SI-NEXT: v_add_i32_e32 v6, vcc, 0xfffffc01, v4 @@ -93,12 +92,12 @@ define amdgpu_kernel void @v_round_f64(ptr addrspace(1) %out, ptr addrspace(1) % ; SI-NEXT: v_cndmask_b32_e32 v5, v5, v3, vcc ; SI-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc ; SI-NEXT: v_add_f64 v[6:7], v[2:3], -v[4:5] -; SI-NEXT: s_brev_b32 s2, -2 +; SI-NEXT: v_mov_b32_e32 v2, 0x3ff00000 ; SI-NEXT: v_cmp_ge_f64_e64 vcc, |v[6:7]|, 0.5 -; SI-NEXT: v_cndmask_b32_e32 v2, 0, v8, vcc -; SI-NEXT: v_bfi_b32 v3, s2, v2, v3 -; SI-NEXT: v_mov_b32_e32 v2, v1 -; SI-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3] +; SI-NEXT: s_brev_b32 s2, -2 +; SI-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; SI-NEXT: v_bfi_b32 v2, s2, v2, v3 +; SI-NEXT: v_add_f64 v[2:3], v[4:5], v[1:2] ; SI-NEXT: s_mov_b64 s[2:3], s[6:7] ; SI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 ; SI-NEXT: s_endpgm @@ -113,16 +112,15 @@ define amdgpu_kernel void @v_round_f64(ptr addrspace(1) %out, ptr addrspace(1) % ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_mov_b64 s[4:5], s[2:3] ; CI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 -; CI-NEXT: v_mov_b32_e32 v8, 0x3ff00000 ; CI-NEXT: s_brev_b32 s2, -2 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_trunc_f64_e32 v[4:5], v[2:3] ; CI-NEXT: v_add_f64 v[6:7], v[2:3], -v[4:5] +; CI-NEXT: v_mov_b32_e32 v2, 0x3ff00000 ; CI-NEXT: v_cmp_ge_f64_e64 vcc, |v[6:7]|, 0.5 -; CI-NEXT: v_cndmask_b32_e32 v2, 0, v8, vcc -; CI-NEXT: v_bfi_b32 v3, s2, v2, v3 -; CI-NEXT: v_mov_b32_e32 v2, v1 -; CI-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3] +; CI-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; CI-NEXT: v_bfi_b32 v2, s2, v2, v3 +; CI-NEXT: v_add_f64 v[2:3], v[4:5], v[1:2] ; CI-NEXT: s_mov_b64 s[2:3], s[6:7] ; CI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 ; CI-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll index 59f4a9d44bbdd..d23c49165ec70 100644 --- a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll +++ b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll @@ -5985,14 +5985,13 @@ define amdgpu_kernel void @global_sextload_v4i16_to_v4i64(ptr addrspace(1) %out, ; GCN-NOHSA-SI-NEXT: s_mov_b32 s4, s0 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s5, s1 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, v1 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GCN-NOHSA-SI-NEXT: v_bfe_i32 v0, v0, 0, 16 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v7, 31, v1 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v6, 16, v1 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v4, v2, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v4, v1, 0, 16 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v2, v3, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v2, v2, 0, 16 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v5, 31, v4 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v3, 31, v2 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16 @@ -6011,23 
+6010,22 @@ define amdgpu_kernel void @global_sextload_v4i16_to_v4i64(ptr addrspace(1) %out, ; GCN-HSA-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v11, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v10, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v8, s0 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, s2 ; GCN-HSA-NEXT: s_waitcnt vmcnt(0) -; GCN-HSA-NEXT: v_mov_b32_e32 v3, v1 +; GCN-HSA-NEXT: v_bfe_i32 v4, v1, 0, 16 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GCN-HSA-NEXT: v_bfe_i32 v4, v3, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v0, v0, 0, 16 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v7, 31, v1 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v6, 16, v1 -; GCN-HSA-NEXT: v_bfe_i32 v2, v2, 0, 16 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v5, 31, v4 +; GCN-HSA-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v2, v2, 0, 16 +; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[4:7] ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 31, v2 -; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[4:7] -; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3] +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_endpgm ; ; GCN-NOHSA-VI-LABEL: global_sextload_v4i16_to_v4i64: @@ -6044,11 +6042,10 @@ define amdgpu_kernel void @global_sextload_v4i16_to_v4i64(ptr addrspace(1) %out, ; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, v2 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v6, 16, v2 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v4, v4, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v6, v5, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v4, v2, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v6, v6, 0, 16 ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v0, v1, 0, 16 ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v2, v3, 0, 16 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v5, 31, v4 diff --git a/llvm/test/CodeGen/AMDGPU/load-local-i16.ll b/llvm/test/CodeGen/AMDGPU/load-local-i16.ll index 062a985dd7180..b4c0b7497b95f 100644 --- a/llvm/test/CodeGen/AMDGPU/load-local-i16.ll +++ b/llvm/test/CodeGen/AMDGPU/load-local-i16.ll @@ -5734,20 +5734,19 @@ define amdgpu_kernel void @local_sextload_v4i16_to_v4i64(ptr addrspace(3) %out, ; SI-NEXT: v_mov_b32_e32 v0, s1 ; SI-NEXT: s_mov_b32 m0, -1 ; SI-NEXT: ds_read_b64 v[0:1], v0 -; SI-NEXT: v_mov_b32_e32 v9, s0 +; SI-NEXT: v_mov_b32_e32 v8, s0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v3, v1 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 -; SI-NEXT: v_ashrrev_i32_e32 v2, 31, v1 -; SI-NEXT: v_ashrrev_i32_e32 v1, 16, v1 -; SI-NEXT: v_bfe_i32 v3, v3, 0, 16 -; SI-NEXT: v_bfe_i32 v5, v0, 0, 16 -; SI-NEXT: v_bfe_i32 v7, v4, 0, 16 -; SI-NEXT: v_ashrrev_i32_e32 v4, 31, v3 -; SI-NEXT: v_ashrrev_i32_e32 v6, 31, v5 -; SI-NEXT: v_ashrrev_i32_e32 v8, 31, v7 -; SI-NEXT: ds_write2_b64 v9, v[3:4], v[1:2] offset0:2 offset1:3 -; SI-NEXT: ds_write2_b64 v9, v[5:6], v[7:8] offset1:1 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; SI-NEXT: v_ashrrev_i32_e32 v3, 31, v1 +; SI-NEXT: v_ashrrev_i32_e32 v2, 16, v1 +; SI-NEXT: v_bfe_i32 v4, v1, 0, 16 +; SI-NEXT: v_bfe_i32 v0, v0, 0, 16 +; SI-NEXT: v_bfe_i32 v6, v5, 0, 16 +; SI-NEXT: v_ashrrev_i32_e32 v5, 31, v4 +; SI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; SI-NEXT: v_ashrrev_i32_e32 v7, 31, v6 +; SI-NEXT: 
ds_write2_b64 v8, v[4:5], v[2:3] offset0:2 offset1:3 +; SI-NEXT: ds_write2_b64 v8, v[0:1], v[6:7] offset1:1 ; SI-NEXT: s_endpgm ; ; VI-NO-DS128-LABEL: local_sextload_v4i16_to_v4i64: @@ -5757,20 +5756,20 @@ define amdgpu_kernel void @local_sextload_v4i16_to_v4i64(ptr addrspace(3) %out, ; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) ; VI-NO-DS128-NEXT: v_mov_b32_e32 v0, s1 ; VI-NO-DS128-NEXT: ds_read_b64 v[0:1], v0 -; VI-NO-DS128-NEXT: v_mov_b32_e32 v8, s0 ; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v4, 16, v1 ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; VI-NO-DS128-NEXT: v_bfe_i32 v4, v3, 0, 16 -; VI-NO-DS128-NEXT: v_bfe_i32 v6, v1, 0, 16 -; VI-NO-DS128-NEXT: v_bfe_i32 v0, v0, 0, 16 -; VI-NO-DS128-NEXT: v_bfe_i32 v2, v2, 0, 16 -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v5, 31, v4 -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v7, 31, v6 -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v3, 31, v2 -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; VI-NO-DS128-NEXT: ds_write2_b64 v8, v[6:7], v[4:5] offset0:2 offset1:3 -; VI-NO-DS128-NEXT: ds_write2_b64 v8, v[0:1], v[2:3] offset1:1 +; VI-NO-DS128-NEXT: v_bfe_i32 v1, v1, 0, 16 +; VI-NO-DS128-NEXT: v_bfe_i32 v7, v4, 0, 16 +; VI-NO-DS128-NEXT: v_bfe_i32 v3, v0, 0, 16 +; VI-NO-DS128-NEXT: v_bfe_i32 v5, v2, 0, 16 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v2, 31, v1 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v8, 31, v7 +; VI-NO-DS128-NEXT: v_mov_b32_e32 v0, s0 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v4, 31, v3 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v6, 31, v5 +; VI-NO-DS128-NEXT: ds_write2_b64 v0, v[1:2], v[7:8] offset0:2 offset1:3 +; VI-NO-DS128-NEXT: ds_write2_b64 v0, v[3:4], v[5:6] offset1:1 ; VI-NO-DS128-NEXT: s_endpgm ; ; GFX9-NO-DS128-LABEL: local_sextload_v4i16_to_v4i64: @@ -5779,20 +5778,20 @@ define amdgpu_kernel void @local_sextload_v4i16_to_v4i64(ptr addrspace(3) %out, ; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v0, s1 ; GFX9-NO-DS128-NEXT: ds_read_b64 v[0:1], v0 -; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v8, s0 +; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v9, s0 ; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v4, 16, v1 ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v4, v3, 0, 16 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v6, v1, 0, 16 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v0, v0, 0, 16 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v2, v2, 0, 16 -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v5, 31, v4 -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v7, 31, v6 -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v3, 31, v2 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v8, v[6:7], v[4:5] offset0:2 offset1:3 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v8, v[0:1], v[2:3] offset1:1 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v1, v1, 0, 16 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v7, v4, 0, 16 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v3, v0, 0, 16 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v5, v2, 0, 16 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v2, 31, v1 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v8, 31, v7 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v4, 31, v3 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v6, 31, v5 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v9, v[1:2], v[7:8] offset0:2 offset1:3 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v9, v[3:4], v[5:6] offset1:1 ; GFX9-NO-DS128-NEXT: s_endpgm ; ; EG-LABEL: local_sextload_v4i16_to_v4i64: @@ -5846,22 +5845,21 @@ define amdgpu_kernel void 
@local_sextload_v4i16_to_v4i64(ptr addrspace(3) %out, ; VI-DS128-NEXT: s_mov_b32 m0, -1 ; VI-DS128-NEXT: s_waitcnt lgkmcnt(0) ; VI-DS128-NEXT: v_mov_b32_e32 v0, s1 -; VI-DS128-NEXT: ds_read_b64 v[0:1], v0 +; VI-DS128-NEXT: ds_read_b64 v[1:2], v0 ; VI-DS128-NEXT: v_mov_b32_e32 v8, s0 ; VI-DS128-NEXT: s_waitcnt lgkmcnt(0) -; VI-DS128-NEXT: v_mov_b32_e32 v3, v1 -; VI-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; VI-DS128-NEXT: v_bfe_i32 v4, v3, 0, 16 -; VI-DS128-NEXT: v_bfe_i32 v6, v1, 0, 16 -; VI-DS128-NEXT: v_bfe_i32 v0, v0, 0, 16 +; VI-DS128-NEXT: v_bfe_i32 v0, v2, 0, 16 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; VI-DS128-NEXT: v_bfe_i32 v2, v2, 0, 16 +; VI-DS128-NEXT: v_bfe_i32 v4, v1, 0, 16 +; VI-DS128-NEXT: v_bfe_i32 v6, v3, 0, 16 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v3, 31, v2 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v5, 31, v4 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v7, 31, v6 -; VI-DS128-NEXT: v_ashrrev_i32_e32 v3, 31, v2 -; VI-DS128-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; VI-DS128-NEXT: ds_write_b128 v8, v[4:7] offset:16 -; VI-DS128-NEXT: ds_write_b128 v8, v[0:3] +; VI-DS128-NEXT: ds_write_b128 v8, v[0:3] offset:16 +; VI-DS128-NEXT: ds_write_b128 v8, v[4:7] ; VI-DS128-NEXT: s_endpgm ; ; GFX9-DS128-LABEL: local_sextload_v4i16_to_v4i64: @@ -5869,22 +5867,21 @@ define amdgpu_kernel void @local_sextload_v4i16_to_v4i64(ptr addrspace(3) %out, ; GFX9-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DS128-NEXT: v_mov_b32_e32 v0, s1 -; GFX9-DS128-NEXT: ds_read_b64 v[0:1], v0 +; GFX9-DS128-NEXT: ds_read_b64 v[1:2], v0 ; GFX9-DS128-NEXT: v_mov_b32_e32 v8, s0 ; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DS128-NEXT: v_mov_b32_e32 v3, v1 -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX9-DS128-NEXT: v_bfe_i32 v4, v3, 0, 16 -; GFX9-DS128-NEXT: v_bfe_i32 v6, v1, 0, 16 -; GFX9-DS128-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX9-DS128-NEXT: v_bfe_i32 v0, v2, 0, 16 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; GFX9-DS128-NEXT: v_bfe_i32 v2, v2, 0, 16 -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v5, 31, v4 -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v7, 31, v6 +; GFX9-DS128-NEXT: v_bfe_i32 v4, v1, 0, 16 +; GFX9-DS128-NEXT: v_bfe_i32 v6, v3, 0, 16 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v3, 31, v2 -; GFX9-DS128-NEXT: ds_write_b128 v8, v[4:7] offset:16 -; GFX9-DS128-NEXT: ds_write_b128 v8, v[0:3] +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v5, 31, v4 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v7, 31, v6 +; GFX9-DS128-NEXT: ds_write_b128 v8, v[0:3] offset:16 +; GFX9-DS128-NEXT: ds_write_b128 v8, v[4:7] ; GFX9-DS128-NEXT: s_endpgm %load = load <4 x i16>, ptr addrspace(3) %in %ext = sext <4 x i16> %load to <4 x i64> diff --git a/llvm/test/CodeGen/AMDGPU/mad_64_32.ll b/llvm/test/CodeGen/AMDGPU/mad_64_32.ll index 08ec0c847e941..87d52684e588c 100644 --- a/llvm/test/CodeGen/AMDGPU/mad_64_32.ll +++ b/llvm/test/CodeGen/AMDGPU/mad_64_32.ll @@ -632,12 +632,12 @@ define i64 @mad_i64_i32_extops_i32_i64(i32 %arg0, i32 %arg1, i64 %arg2) #0 { ; GFX1100: ; %bb.0: ; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1100-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v0 -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1100-NEXT: v_mad_u64_u32 v[0:1], null, v5, v4, v[2:3] ; GFX1100-NEXT: v_ashrrev_i32_e32 v5, 31, v5 -; GFX1100-NEXT: v_mov_b32_e32 v3, v1 +; GFX1100-NEXT: v_mad_u64_u32 v[2:3], null, v5, v4, v[1:2] ; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1100-NEXT: v_mad_u64_u32 v[1:2], null, v5, v4, v[3:4] +; GFX1100-NEXT: v_mov_b32_e32 v1, v2 ; GFX1100-NEXT: s_setpc_b64 s[30:31] ; ; GFX1150-LABEL: mad_i64_i32_extops_i32_i64: @@ -775,13 +775,13 @@ define i64 @mad_u64_u32_bitops_lhs_mask_small(i64 %arg0, i64 %arg1, i64 %arg2) # ; GFX1100-LABEL: mad_u64_u32_bitops_lhs_mask_small: ; GFX1100: ; %bb.0: ; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1100-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v0 -; GFX1100-NEXT: v_mov_b32_e32 v6, v1 -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-NEXT: v_mad_u64_u32 v[0:1], null, v2, v3, v[4:5] -; GFX1100-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_and_b32 v5, 1, v6 -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1100-NEXT: v_mad_u64_u32 v[1:2], null, v5, v3, v[4:5] +; GFX1100-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v6, v1 +; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1100-NEXT: v_mad_u64_u32 v[0:1], null, v3, v2, v[4:5] +; GFX1100-NEXT: v_and_b32_e32 v5, 1, v6 +; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-NEXT: v_mad_u64_u32 v[3:4], null, v5, v2, v[1:2] +; GFX1100-NEXT: v_mov_b32_e32 v1, v3 ; GFX1100-NEXT: s_setpc_b64 s[30:31] ; ; GFX1150-LABEL: mad_u64_u32_bitops_lhs_mask_small: @@ -863,11 +863,12 @@ define i64 @mad_u64_u32_bitops_rhs_mask_small(i64 %arg0, i64 %arg1, i64 %arg2) # ; GFX1100: ; %bb.0: ; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1100-NEXT: v_mov_b32_e32 v6, v0 -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1100-NEXT: v_mad_u64_u32 v[0:1], null, v6, v2, v[4:5] -; GFX1100-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_and_b32 v4, 1, v3 +; GFX1100-NEXT: v_and_b32_e32 v4, 1, v3 +; GFX1100-NEXT: v_mad_u64_u32 v[2:3], null, v6, v4, v[1:2] ; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1100-NEXT: v_mad_u64_u32 v[1:2], null, v6, v4, v[3:4] +; GFX1100-NEXT: v_mov_b32_e32 v1, v2 ; GFX1100-NEXT: s_setpc_b64 s[30:31] ; ; GFX1150-LABEL: mad_u64_u32_bitops_rhs_mask_small: @@ -1807,10 +1808,9 @@ define i64 @lshr_mad_i64_4(i32 %arg0, i64 %arg1) #0 { ; GFX1100: ; %bb.0: ; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1100-NEXT: v_mad_u64_u32 v[3:4], null, v1, v0, 0 -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-NEXT: v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v4, 0 -; GFX1100-NEXT: v_mad_u64_u32 v[5:6], null, v2, v0, v[1:2] -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1100-NEXT: v_mad_u64_u32 v[5:6], null, v2, v0, v[4:5] +; GFX1100-NEXT: v_mov_b32_e32 v4, 0 ; GFX1100-NEXT: v_mad_u64_u32 v[0:1], null, 0xfffffc88, v5, v[3:4] ; GFX1100-NEXT: s_setpc_b64 s[30:31] ; @@ -1818,10 +1818,9 @@ define i64 @lshr_mad_i64_4(i32 %arg0, i64 %arg1) #0 { ; GFX1150: ; %bb.0: ; GFX1150-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1150-NEXT: v_mad_u64_u32 v[3:4], null, v1, v0, 0 -; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) -; GFX1150-NEXT: v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v4, 0 -; GFX1150-NEXT: v_mad_u64_u32 v[0:1], null, v2, v0, v[1:2] -; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_mad_u64_u32 v[0:1], null, v2, v0, v[4:5] +; GFX1150-NEXT: v_mov_b32_e32 v4, 0 ; GFX1150-NEXT: v_mad_u64_u32 v[0:1], null, 0xfffffc88, v0, v[3:4] ; GFX1150-NEXT: s_setpc_b64 s[30:31] ; @@ -1833,10 +1832,9 @@ define i64 @lshr_mad_i64_4(i32 %arg0, i64 %arg1) #0 { ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mad_co_u64_u32 v[3:4], null, v1, v0, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v4, 0 -; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v2, v0, v[1:2] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v2, v0, v[4:5] +; GFX12-NEXT: v_mov_b32_e32 v4, 0 ; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, 0xfffffc88, v0, v[3:4] ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -2126,23 +2124,21 @@ define i64 @lshr_mad_i64_negative_4(i64 %arg0) #0 { ; GFX1100-LABEL: lshr_mad_i64_negative_4: ; GFX1100: ; %bb.0: ; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1100-NEXT: v_mad_u64_u32 v[2:3], null, v1, v0, v[0:1] +; GFX1100-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 ; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-NEXT: v_mov_b32_e32 v0, v3 -; GFX1100-NEXT: v_mad_u64_u32 v[3:4], null, v1, v1, v[0:1] +; GFX1100-NEXT: v_mad_u64_u32 v[0:1], null, v3, v2, v[2:3] +; GFX1100-NEXT: v_mad_u64_u32 v[4:5], null, v3, v3, v[1:2] ; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1100-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 +; GFX1100-NEXT: v_mov_b32_e32 v1, v4 ; GFX1100-NEXT: s_setpc_b64 s[30:31] ; ; GFX1150-LABEL: lshr_mad_i64_negative_4: ; GFX1150: ; %bb.0: ; GFX1150-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1150-NEXT: v_mad_u64_u32 v[3:4], null, v1, v0, v[0:1] +; GFX1150-NEXT: v_dual_mov_b32 v2, v1 :: v_dual_mov_b32 v1, v0 ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-NEXT: v_mov_b32_e32 v0, v4 -; GFX1150-NEXT: v_mad_u64_u32 v[1:2], null, v1, v1, v[0:1] -; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX1150-NEXT: v_mov_b32_e32 v0, v3 +; GFX1150-NEXT: v_mad_u64_u32 v[0:1], null, v2, v1, v[1:2] +; GFX1150-NEXT: v_mad_u64_u32 v[1:2], null, v2, v2, v[1:2] ; GFX1150-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: lshr_mad_i64_negative_4: @@ -2152,12 +2148,10 @@ define i64 @lshr_mad_i64_negative_4(i64 %arg0) #0 { ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mad_co_u64_u32 v[3:4], null, v1, v0, v[0:1] +; GFX12-NEXT: v_dual_mov_b32 v2, v1 :: v_dual_mov_b32 v1, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_mov_b32_e32 v0, v4 -; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], null, v1, v1, v[0:1] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v2, v1, v[1:2] +; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], null, v2, v2, v[1:2] ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX1250-LABEL: lshr_mad_i64_negative_4: diff --git 
a/llvm/test/CodeGen/AMDGPU/mul.ll b/llvm/test/CodeGen/AMDGPU/mul.ll index d29847e40dc8b..4681d589ac217 100644 --- a/llvm/test/CodeGen/AMDGPU/mul.ll +++ b/llvm/test/CodeGen/AMDGPU/mul.ll @@ -2989,34 +2989,33 @@ define amdgpu_kernel void @s_mul_i128(ptr addrspace(1) %out, [8 x i32], i128 %a, ; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x4c ; VI-NEXT: s_load_dwordx4 s[12:15], s[4:5], 0x7c ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; VI-NEXT: v_mov_b32_e32 v5, 0 +; VI-NEXT: v_mov_b32_e32 v2, 0 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s10 -; VI-NEXT: v_mad_u64_u32 v[2:3], s[4:5], s12, v0, 0 +; VI-NEXT: v_mad_u64_u32 v[3:4], s[4:5], s12, v0, 0 ; VI-NEXT: s_mul_i32 s4, s12, s11 -; VI-NEXT: v_mov_b32_e32 v6, s12 -; VI-NEXT: v_add_u32_e32 v3, vcc, s4, v3 -; VI-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s8, v6, 0 -; VI-NEXT: s_mul_i32 s6, s13, s10 -; VI-NEXT: v_add_u32_e32 v3, vcc, s6, v3 -; VI-NEXT: v_mov_b32_e32 v4, v1 -; VI-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s9, v6, v[4:5] -; VI-NEXT: v_mov_b32_e32 v8, s8 -; VI-NEXT: v_mad_u64_u32 v[8:9], s[4:5], s14, v8, v[2:3] -; VI-NEXT: v_mov_b32_e32 v4, v6 -; VI-NEXT: v_mov_b32_e32 v6, s13 -; VI-NEXT: v_mad_u64_u32 v[1:2], s[4:5], s8, v6, v[4:5] +; VI-NEXT: v_mov_b32_e32 v5, s12 +; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v4 +; VI-NEXT: s_mul_i32 s4, s13, s10 +; VI-NEXT: v_add_u32_e32 v4, vcc, s4, v0 +; VI-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s8, v5, 0 +; VI-NEXT: v_mov_b32_e32 v7, s8 +; VI-NEXT: v_mad_u64_u32 v[3:4], s[4:5], s14, v7, v[3:4] +; VI-NEXT: v_mad_u64_u32 v[5:6], s[4:5], s9, v5, v[1:2] +; VI-NEXT: v_mov_b32_e32 v7, s13 ; VI-NEXT: s_mul_i32 s6, s15, s8 -; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v9 -; VI-NEXT: v_add_u32_e32 v2, vcc, v7, v2 -; VI-NEXT: v_addc_u32_e64 v3, s[4:5], 0, 0, vcc -; VI-NEXT: v_mad_u64_u32 v[2:3], s[4:5], s9, v6, v[2:3] +; VI-NEXT: v_mov_b32_e32 v1, v5 +; VI-NEXT: v_mad_u64_u32 v[1:2], s[4:5], s8, v7, v[1:2] +; VI-NEXT: v_add_u32_e32 v8, vcc, s6, v4 +; VI-NEXT: v_add_u32_e32 v4, vcc, v6, v2 +; VI-NEXT: v_addc_u32_e64 v5, s[4:5], 0, 0, vcc +; VI-NEXT: v_mad_u64_u32 v[4:5], s[4:5], s9, v7, v[4:5] ; VI-NEXT: s_mul_i32 s6, s14, s9 -; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v4 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v8 +; VI-NEXT: v_add_u32_e32 v6, vcc, s6, v8 +; VI-NEXT: v_add_u32_e32 v2, vcc, v4, v3 ; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: v_addc_u32_e32 v3, vcc, v3, v4, vcc +; VI-NEXT: v_addc_u32_e32 v3, vcc, v5, v6, vcc ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; VI-NEXT: s_endpgm ; @@ -3370,67 +3369,66 @@ define amdgpu_kernel void @v_mul_i128(ptr addrspace(1) %out, ptr addrspace(1) %a ; VI: ; %bb.0: ; %entry ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c ; VI-NEXT: v_lshlrev_b32_e32 v2, 4, v0 -; VI-NEXT: v_mov_b32_e32 v10, 0 +; VI-NEXT: v_mov_b32_e32 v9, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: v_mov_b32_e32 v3, s3 -; VI-NEXT: v_add_u32_e32 v12, vcc, s2, v2 -; VI-NEXT: v_addc_u32_e32 v13, vcc, 0, v3, vcc +; VI-NEXT: v_add_u32_e32 v11, vcc, s2, v2 +; VI-NEXT: v_addc_u32_e32 v12, vcc, 0, v3, vcc ; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] -; VI-NEXT: flat_load_dwordx4 v[4:7], v[12:13] +; VI-NEXT: flat_load_dwordx4 v[4:7], v[11:12] ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mul_lo_u32 v3, v4, v3 -; VI-NEXT: v_mad_u64_u32 v[14:15], s[0:1], v4, v2, 0 -; VI-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v0, v4, 0 +; VI-NEXT: v_mad_u64_u32 v[13:14], s[0:1], v4, v2, 0 ; 
VI-NEXT: v_mul_lo_u32 v2, v5, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, v15, v3 -; VI-NEXT: v_add_u32_e32 v15, vcc, v3, v2 -; VI-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v1, v4, v[9:10] -; VI-NEXT: v_mad_u64_u32 v[14:15], s[0:1], v6, v0, v[14:15] -; VI-NEXT: v_mov_b32_e32 v9, v2 -; VI-NEXT: v_mul_lo_u32 v2, v7, v0 -; VI-NEXT: v_mad_u64_u32 v[9:10], s[0:1], v0, v5, v[9:10] +; VI-NEXT: v_mul_lo_u32 v10, v7, v0 +; VI-NEXT: v_mad_u64_u32 v[7:8], s[0:1], v0, v4, 0 +; VI-NEXT: v_add_u32_e32 v3, vcc, v14, v3 +; VI-NEXT: v_add_u32_e32 v14, vcc, v3, v2 +; VI-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v1, v4, v[8:9] +; VI-NEXT: v_mad_u64_u32 v[13:14], s[0:1], v6, v0, v[13:14] +; VI-NEXT: v_mov_b32_e32 v8, v2 +; VI-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v0, v5, v[8:9] ; VI-NEXT: v_mul_lo_u32 v4, v6, v1 -; VI-NEXT: v_add_u32_e32 v6, vcc, v2, v15 -; VI-NEXT: v_add_u32_e32 v2, vcc, v3, v10 +; VI-NEXT: v_add_u32_e32 v6, vcc, v10, v14 +; VI-NEXT: v_add_u32_e32 v2, vcc, v3, v9 ; VI-NEXT: v_addc_u32_e64 v3, s[0:1], 0, 0, vcc ; VI-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v1, v5, v[2:3] ; VI-NEXT: v_add_u32_e32 v2, vcc, v4, v6 -; VI-NEXT: v_add_u32_e32 v10, vcc, v0, v14 -; VI-NEXT: v_addc_u32_e32 v11, vcc, v1, v2, vcc -; VI-NEXT: flat_store_dwordx4 v[12:13], v[8:11] +; VI-NEXT: v_add_u32_e32 v9, vcc, v0, v13 +; VI-NEXT: v_addc_u32_e32 v10, vcc, v1, v2, vcc +; VI-NEXT: flat_store_dwordx4 v[11:12], v[7:10] ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_mul_i128: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c -; GFX9-NEXT: v_lshlrev_b32_e32 v14, 4, v0 -; GFX9-NEXT: v_mov_b32_e32 v11, 0 +; GFX9-NEXT: v_lshlrev_b32_e32 v13, 4, v0 +; GFX9-NEXT: v_mov_b32_e32 v10, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx4 v[0:3], v14, s[0:1] -; GFX9-NEXT: global_load_dwordx4 v[4:7], v14, s[2:3] +; GFX9-NEXT: global_load_dwordx4 v[0:3], v13, s[0:1] +; GFX9-NEXT: global_load_dwordx4 v[4:7], v13, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mul_lo_u32 v10, v5, v2 -; GFX9-NEXT: v_mul_lo_u32 v12, v4, v3 -; GFX9-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v4, v2, 0 -; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v0, v4, 0 -; GFX9-NEXT: v_add3_u32 v9, v9, v12, v10 -; GFX9-NEXT: v_mul_lo_u32 v15, v6, v1 -; GFX9-NEXT: v_mov_b32_e32 v10, v3 -; GFX9-NEXT: v_mad_u64_u32 v[12:13], s[0:1], v1, v4, v[10:11] -; GFX9-NEXT: v_mov_b32_e32 v10, v12 -; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], v0, v5, v[10:11] -; GFX9-NEXT: v_mul_lo_u32 v10, v7, v0 -; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[0:1], v6, v0, v[8:9] -; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v13, v4 -; GFX9-NEXT: v_addc_co_u32_e64 v9, s[0:1], 0, 0, vcc -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v1, v5, v[8:9] -; GFX9-NEXT: v_add3_u32 v5, v10, v7, v15 -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v0, v6 -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v1, v5, vcc -; GFX9-NEXT: global_store_dwordx4 v14, v[2:5], s[2:3] +; GFX9-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v0, v4, 0 +; GFX9-NEXT: v_mul_lo_u32 v14, v5, v2 +; GFX9-NEXT: v_mul_lo_u32 v15, v4, v3 +; GFX9-NEXT: v_mad_u64_u32 v[11:12], s[0:1], v1, v4, v[9:10] +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v4, v2, 0 +; GFX9-NEXT: v_mov_b32_e32 v9, v11 +; GFX9-NEXT: v_mad_u64_u32 v[9:10], s[0:1], v0, v5, v[9:10] +; GFX9-NEXT: v_add3_u32 v3, v3, v15, v14 +; GFX9-NEXT: v_mul_lo_u32 v4, v6, v1 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v6, v0, v[2:3] +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v12, v10 +; GFX9-NEXT: v_mul_lo_u32 v14, v7, v0 +; GFX9-NEXT: v_addc_co_u32_e64 v7, s[0:1], 0, 0, vcc +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v1, v5, v[6:7] +; 
GFX9-NEXT: v_add3_u32 v3, v14, v3, v4 +; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, v0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, v1, v3, vcc +; GFX9-NEXT: global_store_dwordx4 v13, v[8:11], s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_mul_i128: @@ -3468,37 +3466,36 @@ define amdgpu_kernel void @v_mul_i128(ptr addrspace(1) %out, ptr addrspace(1) %a ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v12, 0 :: v_dual_lshlrev_b32 v17, 4, v0 +; GFX11-NEXT: v_dual_mov_b32 v10, 0 :: v_dual_lshlrev_b32 v15, 4, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: global_load_b128 v[0:3], v17, s[0:1] -; GFX11-NEXT: global_load_b128 v[4:7], v17, s[2:3] +; GFX11-NEXT: global_load_b128 v[0:3], v15, s[0:1] +; GFX11-NEXT: global_load_b128 v[4:7], v15, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mad_u64_u32 v[8:9], null, v0, v4, 0 -; GFX11-NEXT: v_mul_lo_u32 v18, v5, v2 -; GFX11-NEXT: v_mul_lo_u32 v3, v4, v3 -; GFX11-NEXT: v_mad_u64_u32 v[15:16], null, v4, v2, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_mov_b32_e32 v11, v9 -; GFX11-NEXT: v_add3_u32 v16, v16, v3, v18 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_mad_u64_u32 v[13:14], null, v1, v4, v[11:12] -; GFX11-NEXT: v_mul_lo_u32 v4, v6, v1 -; GFX11-NEXT: v_mov_b32_e32 v11, v13 +; GFX11-NEXT: v_mul_lo_u32 v16, v5, v2 +; GFX11-NEXT: v_mad_u64_u32 v[13:14], null, v4, v2, 0 +; GFX11-NEXT: v_mul_lo_u32 v17, v6, v1 +; GFX11-NEXT: v_mul_lo_u32 v18, v7, v0 +; GFX11-NEXT: v_mad_u64_u32 v[11:12], null, v1, v4, v[9:10] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_mad_u64_u32 v[9:10], null, v0, v5, v[11:12] -; GFX11-NEXT: v_mul_lo_u32 v12, v7, v0 -; GFX11-NEXT: v_add_co_u32 v2, s0, v14, v10 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, 0, s0 -; GFX11-NEXT: v_mad_u64_u32 v[10:11], null, v6, v0, v[15:16] -; GFX11-NEXT: v_mad_u64_u32 v[6:7], null, v1, v5, v[2:3] +; GFX11-NEXT: v_mov_b32_e32 v9, v11 +; GFX11-NEXT: v_mul_lo_u32 v11, v4, v3 +; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, v0, v5, v[9:10] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add3_u32 v0, v12, v11, v4 -; GFX11-NEXT: v_add_co_u32 v10, vcc_lo, v6, v10 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add3_u32 v14, v14, v11, v16 +; GFX11-NEXT: v_add_co_u32 v3, s0, v12, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_add_co_ci_u32_e64 v4, null, 0, 0, s0 +; GFX11-NEXT: v_mad_u64_u32 v[9:10], null, v6, v0, v[13:14] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_mad_u64_u32 v[6:7], null, v1, v5, v[3:4] +; GFX11-NEXT: v_add3_u32 v0, v18, v10, v17 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_u32 v10, vcc_lo, v6, v9 ; GFX11-NEXT: v_add_co_ci_u32_e64 v11, null, v7, v0, vcc_lo -; GFX11-NEXT: global_store_b128 v17, v[8:11], s[2:3] +; GFX11-NEXT: v_mov_b32_e32 v9, v2 +; GFX11-NEXT: global_store_b128 v15, v[8:11], s[2:3] ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: v_mul_i128: diff --git a/llvm/test/CodeGen/AMDGPU/rem_i128.ll 
b/llvm/test/CodeGen/AMDGPU/rem_i128.ll index 8abbdad893819..bbc04aa46adc5 100644 --- a/llvm/test/CodeGen/AMDGPU/rem_i128.ll +++ b/llvm/test/CodeGen/AMDGPU/rem_i128.ll @@ -203,28 +203,27 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-NEXT: v_or_b32_e32 v13, v8, v14 ; GFX9-NEXT: .LBB0_6: ; %Flow3 ; GFX9-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX9-NEXT: v_mul_lo_u32 v18, v13, v5 +; GFX9-NEXT: v_mul_lo_u32 v16, v13, v5 ; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v23, v13, 0 -; GFX9-NEXT: v_mov_b32_e32 v15, 0 -; GFX9-NEXT: v_mul_lo_u32 v9, v11, v4 -; GFX9-NEXT: v_mov_b32_e32 v14, v6 -; GFX9-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v13, v4, 0 -; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v22, v13, v[14:15] -; GFX9-NEXT: v_add3_u32 v8, v8, v18, v9 -; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v12, v23, v[7:8] -; GFX9-NEXT: v_mov_b32_e32 v14, v16 -; GFX9-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v23, v11, v[14:15] +; GFX9-NEXT: v_mov_b32_e32 v7, 0 +; GFX9-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v13, v4, 0 +; GFX9-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v22, v13, v[6:7] +; GFX9-NEXT: v_mul_lo_u32 v15, v11, v4 ; GFX9-NEXT: v_mul_lo_u32 v4, v12, v22 -; GFX9-NEXT: v_mul_lo_u32 v12, v10, v23 -; GFX9-NEXT: v_add_co_u32_e32 v9, vcc, v17, v9 -; GFX9-NEXT: v_addc_co_u32_e64 v10, s[4:5], 0, 0, vcc -; GFX9-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v22, v11, v[9:10] -; GFX9-NEXT: v_add3_u32 v4, v12, v7, v4 -; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v9, v6 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v10, v4, vcc +; GFX9-NEXT: v_mov_b32_e32 v6, v13 +; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v23, v11, v[6:7] +; GFX9-NEXT: v_add3_u32 v9, v9, v16, v15 +; GFX9-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v23, v[8:9] +; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, v14, v7 +; GFX9-NEXT: v_addc_co_u32_e64 v13, s[4:5], 0, 0, vcc +; GFX9-NEXT: v_mul_lo_u32 v15, v10, v23 +; GFX9-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v22, v11, v[12:13] +; GFX9-NEXT: v_add3_u32 v4, v15, v9, v4 +; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v10, v8 +; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v11, v4, vcc ; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v5 -; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v8, vcc -; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v6, vcc +; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v6, vcc +; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v7, vcc ; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v4, vcc ; GFX9-NEXT: v_xor_b32_e32 v0, v0, v20 ; GFX9-NEXT: v_xor_b32_e32 v1, v1, v21 @@ -1590,25 +1589,24 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX9-NEXT: v_mul_lo_u32 v19, v12, v7 ; GFX9-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v4, v12, 0 -; GFX9-NEXT: v_mov_b32_e32 v17, 0 +; GFX9-NEXT: v_mov_b32_e32 v9, 0 ; GFX9-NEXT: v_mul_lo_u32 v18, v13, v6 -; GFX9-NEXT: v_mov_b32_e32 v16, v8 -; GFX9-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v12, v6, 0 -; GFX9-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v5, v12, v[16:17] -; GFX9-NEXT: v_add3_u32 v10, v10, v19, v18 -; GFX9-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v14, v4, v[9:10] -; GFX9-NEXT: v_mov_b32_e32 v16, v11 -; GFX9-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v4, v13, v[16:17] +; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v5, v12, v[8:9] +; GFX9-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v12, v6, 0 +; GFX9-NEXT: v_mov_b32_e32 v8, v16 +; GFX9-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v4, v13, v[8:9] +; GFX9-NEXT: v_add3_u32 v11, v11, v19, v18 +; GFX9-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v4, v[10:11] ; GFX9-NEXT: v_mul_lo_u32 v6, v14, v5 -; GFX9-NEXT: v_mul_lo_u32 v14, v15, v4 -; GFX9-NEXT: v_add_co_u32_e32 
v11, vcc, v12, v11 -; GFX9-NEXT: v_addc_co_u32_e64 v12, s[4:5], 0, 0, vcc -; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v5, v13, v[11:12] -; GFX9-NEXT: v_add3_u32 v6, v14, v9, v6 -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v8 +; GFX9-NEXT: v_add_co_u32_e32 v14, vcc, v17, v9 +; GFX9-NEXT: v_mul_lo_u32 v12, v15, v4 +; GFX9-NEXT: v_addc_co_u32_e64 v15, s[4:5], 0, 0, vcc +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v5, v13, v[14:15] +; GFX9-NEXT: v_add3_u32 v6, v12, v11, v6 +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v10 ; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v5, v6, vcc ; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v7 -; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v10, vcc +; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v8, vcc ; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v4, vcc ; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v5, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v2f32.v2f32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v2f32.v2f32.ll index 9b3dc7f531021..287d1dde21403 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v2f32.v2f32.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v2f32.v2f32.ll @@ -58,12 +58,11 @@ define void @v_shuffle_v2f32_v2f32__1_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2f32_v2f32__1_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v2, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -111,12 +110,11 @@ define void @v_shuffle_v2f32_v2f32__3_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2f32_v2f32__3_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v2, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -154,16 +152,14 @@ define void @v_shuffle_v2f32_v2f32__3_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2f32_v2f32__3_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v4, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -208,15 +204,14 @@ define void @v_shuffle_v2f32_v2f32__3_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2f32_v2f32__3_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ; def v[1:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ; 
def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -304,12 +299,12 @@ define void @v_shuffle_v2f32_v2f32__3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2f32_v2f32__3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -347,12 +342,11 @@ define void @v_shuffle_v2f32_v2f32__u_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2f32_v2f32__u_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ; def v[1:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v0 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v0, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -473,12 +467,11 @@ define void @v_shuffle_v2f32_v2f32__2_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2f32_v2f32__2_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ; def v[1:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v0 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v0, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -593,12 +586,12 @@ define void @v_shuffle_v2f32_v2f32__1_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2f32_v2f32__1_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -724,12 +717,11 @@ define void @v_shuffle_v2f32_v2f32__1_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2f32_v2f32__1_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v2, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -874,11 +866,11 @@ define void @v_shuffle_v2f32_v2f32__1_3(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 ; 
GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v4, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v2f32.v3f32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v2f32.v3f32.ll index 34043cd067b25..d5998e289c09d 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v2f32.v3f32.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v2f32.v3f32.ll @@ -99,36 +99,33 @@ define void @v_shuffle_v2f32_v3f32__2_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2f32_v3f32__2_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v3, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2f32_v3f32__2_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v3, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2f32_v3f32__2_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v3, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -194,36 +191,33 @@ define void @v_shuffle_v2f32_v3f32__5_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2f32_v3f32__5_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v3, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2f32_v3f32__5_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v3, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2f32_v3f32__5_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 
-; GFX942-NEXT: v_mov_b32_e32 v3, 0
-; GFX942-NEXT: v_mov_b32_e32 v0, v2
-; GFX942-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1]
+; GFX942-NEXT: global_store_dwordx2 v3, v[2:3], s[0:1]
 ; GFX942-NEXT: s_waitcnt vmcnt(0)
 ; GFX942-NEXT: s_setpc_b64 s[30:31]
 %vec0 = call <3 x float> asm "; def $0", "=v"()
@@ -237,49 +231,45 @@ define void @v_shuffle_v2f32_v3f32__5_0(ptr addrspace(1) inreg %ptr) {
 ; GFX900-LABEL: v_shuffle_v2f32_v3f32__5_0:
 ; GFX900: ; %bb.0:
 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v6, 0
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:2]
+; GFX900-NEXT: ; def v[3:5]
 ; GFX900-NEXT: ;;#ASMEND
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[1:3]
+; GFX900-NEXT: ; def v[0:2]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v4, 0
-; GFX900-NEXT: v_mov_b32_e32 v1, v3
-; GFX900-NEXT: v_mov_b32_e32 v2, v0
-; GFX900-NEXT: global_store_dwordx2 v4, v[1:2], s[16:17]
+; GFX900-NEXT: global_store_dwordx2 v6, v[2:3], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX90A-LABEL: v_shuffle_v2f32_v3f32__5_0:
 ; GFX90A: ; %bb.0:
 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v7, 0
 ; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:2]
+; GFX90A-NEXT: ; def v[4:6]
 ; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v3, v4
 ; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:4]
+; GFX90A-NEXT: ; def v[0:2]
 ; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
-; GFX90A-NEXT: v_mov_b32_e32 v2, v4
-; GFX90A-NEXT: v_mov_b32_e32 v3, v0
-; GFX90A-NEXT: global_store_dwordx2 v5, v[2:3], s[16:17]
+; GFX90A-NEXT: global_store_dwordx2 v7, v[2:3], s[16:17]
 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
 ; GFX90A-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX942-LABEL: v_shuffle_v2f32_v3f32__5_0:
 ; GFX942: ; %bb.0:
 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v7, 0
 ; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:2]
+; GFX942-NEXT: ; def v[4:6]
 ; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v5, 0
 ; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:4]
+; GFX942-NEXT: ; def v[0:2]
 ; GFX942-NEXT: ;;#ASMEND
 ; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v2, v4
-; GFX942-NEXT: v_mov_b32_e32 v3, v0
-; GFX942-NEXT: global_store_dwordx2 v5, v[2:3], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v3, v4
+; GFX942-NEXT: global_store_dwordx2 v7, v[2:3], s[0:1]
 ; GFX942-NEXT: s_waitcnt vmcnt(0)
 ; GFX942-NEXT: s_setpc_b64 s[30:31]
 %vec0 = call <3 x float> asm "; def $0", "=v"()
@@ -293,46 +283,43 @@ define void @v_shuffle_v2f32_v3f32__5_1(ptr addrspace(1) inreg %ptr) {
 ; GFX900-LABEL: v_shuffle_v2f32_v3f32__5_1:
 ; GFX900: ; %bb.0:
 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:2]
-; GFX900-NEXT: ;;#ASMEND
 ; GFX900-NEXT: v_mov_b32_e32 v5, 0
 ; GFX900-NEXT: ;;#ASMSTART
 ; GFX900-NEXT: ; def v[2:4]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v0, v4
-; GFX900-NEXT: global_store_dwordx2 v5, v[0:1], s[16:17]
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def v[0:2]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: global_store_dwordx2 v5, v[2:3], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX90A-LABEL: v_shuffle_v2f32_v3f32__5_1:
 ; GFX90A: ; %bb.0:
 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:2]
-; GFX90A-NEXT: ;;#ASMEND
 ; GFX90A-NEXT: v_mov_b32_e32 v5, 0
 ; GFX90A-NEXT: ;;#ASMSTART
 ; GFX90A-NEXT: ; def v[2:4]
 ; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v4
-; GFX90A-NEXT: global_store_dwordx2 v5, v[0:1], s[16:17]
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def v[0:2]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: global_store_dwordx2 v5, v[2:3], s[16:17]
 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
 ; GFX90A-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX942-LABEL: v_shuffle_v2f32_v3f32__5_1:
 ; GFX942: ; %bb.0:
 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:2]
-; GFX942-NEXT: ;;#ASMEND
 ; GFX942-NEXT: v_mov_b32_e32 v5, 0
 ; GFX942-NEXT: ;;#ASMSTART
 ; GFX942-NEXT: ; def v[2:4]
 ; GFX942-NEXT: ;;#ASMEND
 ; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v0, v4
-; GFX942-NEXT: global_store_dwordx2 v5, v[0:1], s[0:1]
+; GFX942-NEXT: ;;#ASMSTART
+; GFX942-NEXT: ; def v[0:2]
+; GFX942-NEXT: ;;#ASMEND
+; GFX942-NEXT: global_store_dwordx2 v5, v[2:3], s[0:1]
 ; GFX942-NEXT: s_waitcnt vmcnt(0)
 ; GFX942-NEXT: s_setpc_b64 s[30:31]
 %vec0 = call <3 x float> asm "; def $0", "=v"()
@@ -346,15 +333,14 @@ define void @v_shuffle_v2f32_v3f32__5_2(ptr addrspace(1) inreg %ptr) {
 ; GFX900-LABEL: v_shuffle_v2f32_v3f32__5_2:
 ; GFX900: ; %bb.0:
 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v4, 0
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:2]
+; GFX900-NEXT: ; def v[1:3]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v6, 0
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[3:5]
+; GFX900-NEXT: ; def v[0:2]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v1, v5
-; GFX900-NEXT: global_store_dwordx2 v6, v[1:2], s[16:17]
+; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -362,15 +348,14 @@ define void @v_shuffle_v2f32_v3f32__5_2(ptr addrspace(1) inreg %ptr) {
 ; GFX90A: ; %bb.0:
 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:2]
+; GFX90A-NEXT: ; def v[2:4]
 ; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v3, 0
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-NEXT: v_mov_b32_e32 v3, v4
 ; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[4:6]
+; GFX90A-NEXT: ; def v[0:2]
 ; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v6
-; GFX90A-NEXT: v_mov_b32_e32 v1, v2
-; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17]
+; GFX90A-NEXT: global_store_dwordx2 v5, v[2:3], s[16:17]
 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
 ; GFX90A-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -378,15 +363,14 @@ define void @v_shuffle_v2f32_v3f32__5_2(ptr addrspace(1) inreg %ptr) {
 ; GFX942: ; %bb.0:
 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:2]
+; GFX942-NEXT: ; def v[2:4]
 ; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v3, 0
+; GFX942-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-NEXT: v_mov_b32_e32 v3, v4
 ; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[4:6]
+; GFX942-NEXT: ; def v[0:2]
 ; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v1, v2
-; GFX942-NEXT: v_mov_b32_e32 v0, v6
-; GFX942-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1]
+; GFX942-NEXT: global_store_dwordx2 v5, v[2:3], s[0:1]
 ; GFX942-NEXT: s_waitcnt vmcnt(0)
 ; GFX942-NEXT: s_setpc_b64 s[30:31]
 %vec0 = call <3 x float> asm "; def $0", "=v"()
@@ -400,13 +384,12 @@ define void @v_shuffle_v2f32_v3f32__5_3(ptr addrspace(1) inreg %ptr) {
 ; GFX900-LABEL: v_shuffle_v2f32_v3f32__5_3:
 ; GFX900: ; %bb.0:
 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v4, 0
 ; GFX900-NEXT: ;;#ASMSTART
 ; GFX900-NEXT: ; def v[0:2]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v3, 0
-; GFX900-NEXT: v_mov_b32_e32 v1, v2
-; GFX900-NEXT: v_mov_b32_e32 v2, v0
-; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v3, v0
+; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -445,36 +428,37 @@ define void @v_shuffle_v2f32_v3f32__5_4(ptr addrspace(1) inreg %ptr) {
 ; GFX900-LABEL: v_shuffle_v2f32_v3f32__5_4:
 ; GFX900: ; %bb.0:
 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v4, 0
 ; GFX900-NEXT: ;;#ASMSTART
 ; GFX900-NEXT: ; def v[0:2]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v3, 0
-; GFX900-NEXT: v_mov_b32_e32 v0, v2
-; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v3, v1
+; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX90A-LABEL: v_shuffle_v2f32_v3f32__5_4:
 ; GFX90A: ; %bb.0:
 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
 ; GFX90A-NEXT: ;;#ASMSTART
 ; GFX90A-NEXT: ; def v[0:2]
 ; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v3, 0
-; GFX90A-NEXT: v_mov_b32_e32 v0, v2
-; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v1
+; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17]
 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
 ; GFX90A-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX942-LABEL: v_shuffle_v2f32_v3f32__5_4:
 ; GFX942: ; %bb.0:
 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
 ; GFX942-NEXT: ;;#ASMSTART
 ; GFX942-NEXT: ; def v[0:2]
 ; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v3, 0
-; GFX942-NEXT: v_mov_b32_e32 v0, v2
-; GFX942-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v3, v1
+; GFX942-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1]
 ; GFX942-NEXT: s_waitcnt vmcnt(0)
 ; GFX942-NEXT: s_setpc_b64 s[30:31]
 %vec0 = call <3 x float> asm "; def $0", "=v"()
@@ -488,38 +472,37 @@ define void @v_shuffle_v2f32_v3f32__5_5(ptr addrspace(1) inreg %ptr) {
 ; GFX900-LABEL: v_shuffle_v2f32_v3f32__5_5:
 ; GFX900: ; %bb.0:
 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v4, 0
 ; GFX900-NEXT: ;;#ASMSTART
 ; GFX900-NEXT: ; def v[0:2]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v3, 0
-; GFX900-NEXT: v_mov_b32_e32 v1, v2
-; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v3, v2
+; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX90A-LABEL: v_shuffle_v2f32_v3f32__5_5:
 ; GFX90A: ; %bb.0:
 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
 ; GFX90A-NEXT: ;;#ASMSTART
 ; GFX90A-NEXT: ; def v[0:2]
 ; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v3, 0
-; GFX90A-NEXT: v_mov_b32_e32 v0, v2
-; GFX90A-NEXT: v_mov_b32_e32 v1, v2
-; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
+; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17]
 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
 ; GFX90A-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX942-LABEL: v_shuffle_v2f32_v3f32__5_5:
 ; GFX942: ; %bb.0:
 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
 ; GFX942-NEXT: ;;#ASMSTART
 ; GFX942-NEXT: ; def v[0:2]
 ; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v3, 0
-; GFX942-NEXT: v_mov_b32_e32 v0, v2
-; GFX942-NEXT: v_mov_b32_e32 v1, v2
-; GFX942-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v3, v2
+; GFX942-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1]
 ; GFX942-NEXT: s_waitcnt vmcnt(0)
 ; GFX942-NEXT: s_setpc_b64 s[30:31]
 %vec0 = call <3 x float> asm "; def $0", "=v"()
@@ -533,12 +516,11 @@ define void @v_shuffle_v2f32_v3f32__u_0(ptr addrspace(1) inreg %ptr) {
 ; GFX900-LABEL: v_shuffle_v2f32_v3f32__u_0:
 ; GFX900: ; %bb.0:
 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v0, 0
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:2]
+; GFX900-NEXT: ; def v[1:3]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v3, 0
-; GFX900-NEXT: v_mov_b32_e32 v1, v0
-; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17]
+; GFX900-NEXT: global_store_dwordx2 v0, v[0:1], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -659,13 +641,12 @@ define void @v_shuffle_v2f32_v3f32__2_0(ptr addrspace(1) inreg %ptr) {
 ; GFX900-LABEL: v_shuffle_v2f32_v3f32__2_0:
 ; GFX900: ; %bb.0:
 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v4, 0
 ; GFX900-NEXT: ;;#ASMSTART
 ; GFX900-NEXT: ; def v[0:2]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v3, 0
-; GFX900-NEXT: v_mov_b32_e32 v1, v2
-; GFX900-NEXT: v_mov_b32_e32 v2, v0
-; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v3, v0
+; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -703,12 +684,11 @@ define void @v_shuffle_v2f32_v3f32__3_0(ptr addrspace(1) inreg %ptr) {
 ; GFX900-LABEL: v_shuffle_v2f32_v3f32__3_0:
 ; GFX900: ; %bb.0:
 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v0, 0
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:2]
+; GFX900-NEXT: ; def v[1:3]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v3, 0
-; GFX900-NEXT: v_mov_b32_e32 v1, v0
-; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17]
+; GFX900-NEXT: global_store_dwordx2 v0, v[0:1], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -918,36 +898,37 @@ define void @v_shuffle_v2f32_v3f32__2_1(ptr addrspace(1) inreg %ptr) {
 ; GFX900-LABEL: v_shuffle_v2f32_v3f32__2_1:
 ; GFX900: ; %bb.0:
 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v4, 0
 ; GFX900-NEXT: ;;#ASMSTART
 ; GFX900-NEXT: ; def v[0:2]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v3, 0
-; GFX900-NEXT: v_mov_b32_e32 v0, v2
-; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v3, v1
+; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX90A-LABEL: v_shuffle_v2f32_v3f32__2_1:
 ; GFX90A: ; %bb.0:
 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
 ; GFX90A-NEXT: ;;#ASMSTART
 ; GFX90A-NEXT: ; def v[0:2]
 ; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v3, 0
-; GFX90A-NEXT: v_mov_b32_e32 v0, v2
-; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v1
+; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17]
 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
 ; GFX90A-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX942-LABEL: v_shuffle_v2f32_v3f32__2_1:
 ; GFX942: ; %bb.0:
 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
 ; GFX942-NEXT: ;;#ASMSTART
 ; GFX942-NEXT: ; def v[0:2]
 ; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v3, 0
-; GFX942-NEXT: v_mov_b32_e32 v0, v2
-; GFX942-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v3, v1
+; GFX942-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1]
 ; GFX942-NEXT: s_waitcnt vmcnt(0)
 ; GFX942-NEXT: s_setpc_b64 s[30:31]
 %vec0 = call <3 x float> asm "; def $0", "=v"()
@@ -1178,38 +1159,37 @@ define void @v_shuffle_v2f32_v3f32__2_2(ptr addrspace(1) inreg %ptr) {
 ; GFX900-LABEL: v_shuffle_v2f32_v3f32__2_2:
 ; GFX900: ; %bb.0:
 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v4, 0
 ; GFX900-NEXT: ;;#ASMSTART
 ; GFX900-NEXT: ; def v[0:2]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v3, 0
-; GFX900-NEXT: v_mov_b32_e32 v1, v2
-; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v3, v2
+; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX90A-LABEL: v_shuffle_v2f32_v3f32__2_2:
 ; GFX90A: ; %bb.0:
 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
 ; GFX90A-NEXT: ;;#ASMSTART
 ; GFX90A-NEXT: ; def v[0:2]
 ; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v3, 0
-; GFX90A-NEXT: v_mov_b32_e32 v0, v2
-; GFX90A-NEXT: v_mov_b32_e32 v1, v2
-; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
+; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17]
 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
 ; GFX90A-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX942-LABEL: v_shuffle_v2f32_v3f32__2_2:
 ; GFX942: ; %bb.0:
 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
 ; GFX942-NEXT: ;;#ASMSTART
 ; GFX942-NEXT: ; def v[0:2]
 ; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v3, 0
-; GFX942-NEXT: v_mov_b32_e32 v0, v2
-; GFX942-NEXT: v_mov_b32_e32 v1, v2
-; GFX942-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v3, v2
+; GFX942-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1]
 ; GFX942-NEXT: s_waitcnt vmcnt(0)
 ; GFX942-NEXT: s_setpc_b64 s[30:31]
 %vec0 = call <3 x float> asm "; def $0", "=v"()
@@ -1408,36 +1388,33 @@ define void @v_shuffle_v2f32_v3f32__2_3(ptr addrspace(1) inreg %ptr) {
 ; GFX900-LABEL: v_shuffle_v2f32_v3f32__2_3:
 ; GFX900: ; %bb.0:
 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v3, 0
 ; GFX900-NEXT: ;;#ASMSTART
 ; GFX900-NEXT: ; def v[0:2]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v3, 0
-; GFX900-NEXT: v_mov_b32_e32 v0, v2
-; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17]
+; GFX900-NEXT: global_store_dwordx2 v3, v[2:3], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX90A-LABEL: v_shuffle_v2f32_v3f32__2_3:
 ; GFX90A: ; %bb.0:
 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v3, 0
 ; GFX90A-NEXT: ;;#ASMSTART
 ; GFX90A-NEXT: ; def v[0:2]
 ; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v3, 0
-; GFX90A-NEXT: v_mov_b32_e32 v0, v2
-; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17]
+; GFX90A-NEXT: global_store_dwordx2 v3, v[2:3], s[16:17]
 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
 ; GFX90A-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX942-LABEL: v_shuffle_v2f32_v3f32__2_3:
 ; GFX942: ; %bb.0:
 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v3, 0
 ; GFX942-NEXT: ;;#ASMSTART
 ; GFX942-NEXT: ; def v[0:2]
 ; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v3, 0
-; GFX942-NEXT: v_mov_b32_e32 v0, v2
-; GFX942-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1]
+; GFX942-NEXT: global_store_dwordx2 v3, v[2:3], s[0:1]
 ; GFX942-NEXT: s_waitcnt vmcnt(0)
 ; GFX942-NEXT: s_setpc_b64 s[30:31]
 %vec0 = call <3 x float> asm "; def $0", "=v"()
@@ -1654,42 +1631,42 @@ define void @v_shuffle_v2f32_v3f32__2_4(ptr addrspace(1) inreg %ptr) {
 ; GFX900-NEXT: ; def v[3:5]
 ; GFX900-NEXT: ;;#ASMEND
 ; GFX900-NEXT: v_mov_b32_e32 v6, 0
+; GFX900-NEXT: v_mov_b32_e32 v3, v4
 ; GFX900-NEXT: ;;#ASMSTART
 ; GFX900-NEXT: ; def v[0:2]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v3, v2
-; GFX900-NEXT: global_store_dwordx2 v6, v[3:4], s[16:17]
+; GFX900-NEXT: global_store_dwordx2 v6, v[2:3], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX90A-LABEL: v_shuffle_v2f32_v3f32__2_4:
 ; GFX90A: ; %bb.0:
 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v7, 0
 ; GFX90A-NEXT: ;;#ASMSTART
 ; GFX90A-NEXT: ; def v[4:6]
 ; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v3, 0
+; GFX90A-NEXT: v_mov_b32_e32 v3, v5
 ; GFX90A-NEXT: ;;#ASMSTART
 ; GFX90A-NEXT: ; def v[0:2]
 ; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, v2
-; GFX90A-NEXT: global_store_dwordx2 v3, v[4:5], s[16:17]
+; GFX90A-NEXT: global_store_dwordx2 v7, v[2:3], s[16:17]
 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
 ; GFX90A-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX942-LABEL: v_shuffle_v2f32_v3f32__2_4:
 ; GFX942: ; %bb.0:
 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v7, 0
 ; GFX942-NEXT: ;;#ASMSTART
 ; GFX942-NEXT: ; def v[4:6]
 ; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v3, 0
 ; GFX942-NEXT: ;;#ASMSTART
 ; GFX942-NEXT: ; def v[0:2]
 ; GFX942-NEXT: ;;#ASMEND
 ; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v4, v2
-; GFX942-NEXT: global_store_dwordx2 v3, v[4:5], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v3, v5
+; GFX942-NEXT: global_store_dwordx2 v7, v[2:3], s[0:1]
 ; GFX942-NEXT: s_waitcnt vmcnt(0)
 ; GFX942-NEXT: s_setpc_b64 s[30:31]
 %vec0 = call <3 x float> asm "; def $0", "=v"()
@@ -1939,43 +1916,42 @@ define void @v_shuffle_v2f32_v3f32__2_5(ptr addrspace(1) inreg %ptr) {
 ; GFX900-NEXT: ; def v[3:5]
 ; GFX900-NEXT: ;;#ASMEND
 ; GFX900-NEXT: v_mov_b32_e32 v6, 0
+; GFX900-NEXT: v_mov_b32_e32 v3, v5
 ; GFX900-NEXT: ;;#ASMSTART
 ; GFX900-NEXT: ; def v[0:2]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v4, v2
-; GFX900-NEXT: global_store_dwordx2 v6, v[4:5], s[16:17]
+; GFX900-NEXT: global_store_dwordx2 v6, v[2:3], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX90A-LABEL: v_shuffle_v2f32_v3f32__2_5:
 ; GFX90A: ; %bb.0:
 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v7, 0
 ; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:2]
+; GFX90A-NEXT: ; def v[4:6]
 ; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v3, 0
+; GFX90A-NEXT: v_mov_b32_e32 v3, v6
 ; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[4:6]
+; GFX90A-NEXT: ; def v[0:2]
 ; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v2
-; GFX90A-NEXT: v_mov_b32_e32 v1, v6
-; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17]
+; GFX90A-NEXT: global_store_dwordx2 v7, v[2:3], s[16:17]
 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
 ; GFX90A-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX942-LABEL: v_shuffle_v2f32_v3f32__2_5:
 ; GFX942: ; %bb.0:
 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v7, 0
 ; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:2]
+; GFX942-NEXT: ; def v[4:6]
 ; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v3, 0
 ; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[4:6]
+; GFX942-NEXT: ; def v[0:2]
 ; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v0, v2
-; GFX942-NEXT: v_mov_b32_e32 v1, v6
-; GFX942-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v3, v6
+; GFX942-NEXT: global_store_dwordx2 v7, v[2:3], s[0:1]
 ; GFX942-NEXT: s_waitcnt vmcnt(0)
 ; GFX942-NEXT: s_setpc_b64 s[30:31]
 %vec0 = call <3 x float> asm "; def $0", "=v"()
@@ -2175,7 +2151,7 @@ define void @s_shuffle_v2f32_v3f32__2_u() {
 ; GFX900-NEXT: ;;#ASMSTART
 ; GFX900-NEXT: ; def s[4:6]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: s_mov_b32 s8, s6
+; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7]
 ; GFX900-NEXT: ;;#ASMSTART
 ; GFX900-NEXT: ; use s[8:9]
 ; GFX900-NEXT: ;;#ASMEND
@@ -2187,7 +2163,7 @@ define void @s_shuffle_v2f32_v3f32__2_u() {
 ; GFX90A-NEXT: ;;#ASMSTART
 ; GFX90A-NEXT: ; def s[4:6]
 ; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: s_mov_b32 s8, s6
+; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NEXT: ;;#ASMSTART
 ; GFX90A-NEXT: ; use s[8:9]
 ; GFX90A-NEXT: ;;#ASMEND
@@ -2199,7 +2175,7 @@ define void @s_shuffle_v2f32_v3f32__2_u() {
 ; GFX942-NEXT: ;;#ASMSTART
 ; GFX942-NEXT: ; def s[0:2]
 ; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_mov_b32 s8, s2
+; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3]
 ; GFX942-NEXT: ;;#ASMSTART
 ; GFX942-NEXT: ; use s[8:9]
 ; GFX942-NEXT: ;;#ASMEND
@@ -2274,7 +2250,7 @@ define void @s_shuffle_v2f32_v3f32__5_u() {
 ; GFX900-NEXT: ;;#ASMSTART
 ; GFX900-NEXT: ; def s[4:6]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: s_mov_b32 s8, s6
+; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7]
 ; GFX900-NEXT: ;;#ASMSTART
 ; GFX900-NEXT: ; use s[8:9]
 ; GFX900-NEXT: ;;#ASMEND
@@ -2286,7 +2262,7 @@ define void @s_shuffle_v2f32_v3f32__5_u() {
 ; GFX90A-NEXT: ;;#ASMSTART
 ; GFX90A-NEXT: ; def s[4:6]
 ; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: s_mov_b32 s8, s6
+; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NEXT: ;;#ASMSTART
 ; GFX90A-NEXT: ; use s[8:9]
 ; GFX90A-NEXT: ;;#ASMEND
@@ -2298,7 +2274,7 @@ define void @s_shuffle_v2f32_v3f32__5_u() {
 ; GFX942-NEXT: ;;#ASMSTART
 ; GFX942-NEXT: ; def s[0:2]
 ; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_mov_b32 s8, s2
+; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3]
 ; GFX942-NEXT: ;;#ASMSTART
 ; GFX942-NEXT: ; use s[8:9]
 ; GFX942-NEXT: ;;#ASMEND
@@ -2317,11 +2293,11 @@ define void @s_shuffle_v2f32_v3f32__5_0() {
 ; GFX900-NEXT: ;;#ASMSTART
 ; GFX900-NEXT: ; def s[8:10]
 ; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: s_mov_b32 s7, s8
 ; GFX900-NEXT: ;;#ASMSTART
 ; GFX900-NEXT: ; def s[4:6]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: s_mov_b32 s8, s10
-; GFX900-NEXT: s_mov_b32 s9, s4
+; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7]
 ; GFX900-NEXT: ;;#ASMSTART
 ; GFX900-NEXT: ; use s[8:9]
 ; GFX900-NEXT: ;;#ASMEND
@@ -2333,11 +2309,11 @@ define void @s_shuffle_v2f32_v3f32__5_0() {
 ; GFX90A-NEXT: ;;#ASMSTART
 ; GFX90A-NEXT: ; def s[8:10]
 ; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_mov_b32 s7, s8
 ; GFX90A-NEXT: ;;#ASMSTART
 ; GFX90A-NEXT: ; def s[4:6]
 ; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: s_mov_b32 s8, s10
-; GFX90A-NEXT: s_mov_b32 s9, s4
+; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NEXT: ;;#ASMSTART
 ; GFX90A-NEXT: ; use s[8:9]
 ; GFX90A-NEXT: ;;#ASMEND
@@ -2347,13 +2323,13 @@ define void @s_shuffle_v2f32_v3f32__5_0() {
 ; GFX942: ; %bb.0:
 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def s[0:2]
+; GFX942-NEXT: ; def s[4:6]
 ; GFX942-NEXT: ;;#ASMEND
+; GFX942-NEXT: s_mov_b32 s3, s4
 ; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def s[4:6]
+; GFX942-NEXT: ; def s[0:2]
 ; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_mov_b32 s8, s6
-; GFX942-NEXT: s_mov_b32 s9, s0
+; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3]
 ; GFX942-NEXT: ;;#ASMSTART
 ; GFX942-NEXT: ; use s[8:9]
 ; GFX942-NEXT: ;;#ASMEND
@@ -2372,10 +2348,11 @@ define void @s_shuffle_v2f32_v3f32__5_1() {
 ; GFX900-NEXT: ;;#ASMSTART
 ; GFX900-NEXT: ; def s[8:10]
 ; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: s_mov_b32 s7, s9
 ; GFX900-NEXT: ;;#ASMSTART
 ; GFX900-NEXT: ; def s[4:6]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: s_mov_b32 s8, s6
+; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7]
 ; GFX900-NEXT: ;;#ASMSTART
 ; GFX900-NEXT: ; use s[8:9]
 ; GFX900-NEXT: ;;#ASMEND
@@ -2387,10 +2364,11 @@ define void @s_shuffle_v2f32_v3f32__5_1() {
 ; GFX90A-NEXT: ;;#ASMSTART
 ; GFX90A-NEXT: ; def s[8:10]
 ; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_mov_b32 s7, s9
 ; GFX90A-NEXT: ;;#ASMSTART
 ; GFX90A-NEXT: ; def s[4:6]
 ; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: s_mov_b32 s8, s6
+; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NEXT: ;;#ASMSTART
 ; GFX90A-NEXT: ; use s[8:9]
 ; GFX90A-NEXT: ;;#ASMEND
@@ -2400,12 +2378,13 @@ define void @s_shuffle_v2f32_v3f32__5_1() {
 ; GFX942: ; %bb.0:
 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def s[8:10]
+; GFX942-NEXT: ; def s[4:6]
 ; GFX942-NEXT: ;;#ASMEND
+; GFX942-NEXT: s_mov_b32 s3, s5
 ; GFX942-NEXT: ;;#ASMSTART
 ; GFX942-NEXT: ; def s[0:2]
 ; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_mov_b32 s8, s2
+; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3]
 ; GFX942-NEXT: ;;#ASMSTART
 ; GFX942-NEXT: ; use s[8:9]
 ; GFX942-NEXT: ;;#ASMEND
@@ -2424,11 +2403,11 @@ define void @s_shuffle_v2f32_v3f32__5_2() {
 ; GFX900-NEXT: ;;#ASMSTART
 ; GFX900-NEXT: ; def s[8:10]
 ; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: s_mov_b32 s7, s10
 ; GFX900-NEXT: ;;#ASMSTART
 ; GFX900-NEXT: ; def s[4:6]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: s_mov_b32 s8, s10
-; GFX900-NEXT: s_mov_b32 s9, s6
+; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7]
 ; GFX900-NEXT: ;;#ASMSTART
 ; GFX900-NEXT: ; use s[8:9]
 ; GFX900-NEXT: ;;#ASMEND
@@ -2440,11 +2419,11 @@ define void @s_shuffle_v2f32_v3f32__5_2() {
 ; GFX90A-NEXT: ;;#ASMSTART
 ; GFX90A-NEXT: ; def s[8:10]
 ; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_mov_b32 s7, s10
 ; GFX90A-NEXT: ;;#ASMSTART
 ; GFX90A-NEXT: ; def s[4:6]
 ; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: s_mov_b32 s8, s10
-; GFX90A-NEXT: s_mov_b32 s9, s6
+; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NEXT: ;;#ASMSTART
 ; GFX90A-NEXT: ; use s[8:9]
 ; GFX90A-NEXT: ;;#ASMEND
@@ -2454,13 +2433,13 @@ define void @s_shuffle_v2f32_v3f32__5_2() {
 ; GFX942: ; %bb.0:
 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def s[0:2]
+; GFX942-NEXT: ; def s[4:6]
 ; GFX942-NEXT: ;;#ASMEND
+; GFX942-NEXT: s_mov_b32 s3, s6
 ; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def s[4:6]
+; GFX942-NEXT: ; def s[0:2]
 ; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_mov_b32 s8, s6
-; GFX942-NEXT: s_mov_b32 s9, s2
+; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3]
 ; GFX942-NEXT: ;;#ASMSTART
 ; GFX942-NEXT: ; use s[8:9]
 ; GFX942-NEXT: ;;#ASMEND
@@ -2479,8 +2458,8 @@ define void @s_shuffle_v2f32_v3f32__5_3() {
 ; GFX900-NEXT: ;;#ASMSTART
 ; GFX900-NEXT: ; def s[4:6]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: s_mov_b32 s8, s6
-; GFX900-NEXT: s_mov_b32 s9, s4
+; GFX900-NEXT: s_mov_b32 s7, s4
+; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7]
 ; GFX900-NEXT: ;;#ASMSTART
 ; GFX900-NEXT: ; use s[8:9]
 ; GFX900-NEXT: ;;#ASMEND
@@ -2492,8 +2471,8 @@ define void @s_shuffle_v2f32_v3f32__5_3() {
 ; GFX90A-NEXT: ;;#ASMSTART
 ; GFX90A-NEXT: ; def s[4:6]
 ; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: s_mov_b32 s8, s6
-; GFX90A-NEXT: s_mov_b32 s9, s4
+; GFX90A-NEXT: s_mov_b32 s7, s4
+; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NEXT: ;;#ASMSTART
 ; GFX90A-NEXT: ; use s[8:9]
 ; GFX90A-NEXT: ;;#ASMEND
@@ -2505,8 +2484,8 @@ define void @s_shuffle_v2f32_v3f32__5_3() {
 ; GFX942-NEXT: ;;#ASMSTART
 ; GFX942-NEXT: ; def s[0:2]
 ; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_mov_b32 s8, s2
-; GFX942-NEXT: s_mov_b32 s9, s0
+; GFX942-NEXT: s_mov_b32 s3, s0
+; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3]
 ; GFX942-NEXT: ;;#ASMSTART
 ; GFX942-NEXT: ; use s[8:9]
 ; GFX942-NEXT: ;;#ASMEND
@@ -2519,17 +2498,44 @@ define void @s_shuffle_v2f32_v3f32__5_3() {
 }
 define void @s_shuffle_v2f32_v3f32__5_4() {
-; GFX9-LABEL: s_shuffle_v2f32_v3f32__5_4:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: ;;#ASMSTART
-; GFX9-NEXT: ; def s[8:10]
-; GFX9-NEXT: ;;#ASMEND
-; GFX9-NEXT: s_mov_b32 s8, s10
-; GFX9-NEXT: ;;#ASMSTART
-; GFX9-NEXT: ; use s[8:9]
-; GFX9-NEXT: ;;#ASMEND
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2f32_v3f32__5_4:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def s[4:6]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: s_mov_b32 s7, s5
+; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7]
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; use s[8:9]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2f32_v3f32__5_4:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def s[4:6]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_mov_b32 s7, s5
+; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; use s[8:9]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2f32_v3f32__5_4:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: ;;#ASMSTART
+; GFX942-NEXT: ; def s[0:2]
+; GFX942-NEXT: ;;#ASMEND
+; GFX942-NEXT: s_mov_b32 s3, s1
+; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT: ;;#ASMSTART
+; GFX942-NEXT: ; use s[8:9]
+; GFX942-NEXT: ;;#ASMEND
+; GFX942-NEXT: s_setpc_b64 s[30:31]
 %vec0 = call <3 x float> asm "; def $0", "=s"()
 %vec1 = call <3 x float> asm "; def $0", "=s"()
 %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <2 x i32>
@@ -2544,8 +2550,8 @@ define void @s_shuffle_v2f32_v3f32__5_5() {
 ; GFX900-NEXT: ;;#ASMSTART
 ; GFX900-NEXT: ; def s[4:6]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: s_mov_b32 s8, s6
-; GFX900-NEXT: s_mov_b32 s9, s6
+; GFX900-NEXT: s_mov_b32 s7, s6
+; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7]
 ; GFX900-NEXT: ;;#ASMSTART
 ; GFX900-NEXT: ; use s[8:9]
 ; GFX900-NEXT: ;;#ASMEND
@@ -2557,8 +2563,8 @@ define void @s_shuffle_v2f32_v3f32__5_5() {
 ; GFX90A-NEXT: ;;#ASMSTART
 ; GFX90A-NEXT: ; def s[4:6]
 ; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: s_mov_b32 s8, s6
-; GFX90A-NEXT: s_mov_b32 s9, s6
+; GFX90A-NEXT: s_mov_b32 s7, s6
+; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NEXT: ;;#ASMSTART
 ; GFX90A-NEXT: ; use s[8:9]
 ; GFX90A-NEXT: ;;#ASMEND
@@ -2570,8 +2576,8 @@ define void @s_shuffle_v2f32_v3f32__5_5() {
 ; GFX942-NEXT: ;;#ASMSTART
 ; GFX942-NEXT: ; def s[0:2]
 ; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_mov_b32 s8, s2
-; GFX942-NEXT: s_mov_b32 s9, s2
+; GFX942-NEXT: s_mov_b32 s3, s2
+; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3]
 ; GFX942-NEXT: ;;#ASMSTART
 ; GFX942-NEXT: ; use s[8:9]
 ; GFX942-NEXT: ;;#ASMEND
@@ -2695,8 +2701,8 @@ define void @s_shuffle_v2f32_v3f32__2_0() {
 ; GFX900-NEXT: ;;#ASMSTART
 ; GFX900-NEXT: ; def s[4:6]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: s_mov_b32 s8, s6
-; GFX900-NEXT: s_mov_b32 s9, s4
+; GFX900-NEXT: s_mov_b32 s7, s4
+; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7]
 ; GFX900-NEXT: ;;#ASMSTART
 ; GFX900-NEXT: ; use s[8:9]
 ; GFX900-NEXT: ;;#ASMEND
@@ -2708,8 +2714,8 @@ define void @s_shuffle_v2f32_v3f32__2_0() {
 ; GFX90A-NEXT: ;;#ASMSTART
 ; GFX90A-NEXT: ; def s[4:6]
 ; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: s_mov_b32 s8, s6
-; GFX90A-NEXT: s_mov_b32 s9, s4
+; GFX90A-NEXT: s_mov_b32 s7, s4
+; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NEXT: ;;#ASMSTART
 ; GFX90A-NEXT: ; use s[8:9]
 ; GFX90A-NEXT: ;;#ASMEND
@@ -2721,8 +2727,8 @@ define void @s_shuffle_v2f32_v3f32__2_0() {
 ; GFX942-NEXT: ;;#ASMSTART
 ; GFX942-NEXT: ; def s[0:2]
 ; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_mov_b32 s8, s2
-; GFX942-NEXT: s_mov_b32 s9, s0
+; GFX942-NEXT: s_mov_b32 s3, s0
+; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3]
 ; GFX942-NEXT: ;;#ASMSTART
 ; GFX942-NEXT: ; use s[8:9]
 ; GFX942-NEXT: ;;#ASMEND
@@ -2929,17 +2935,44 @@ define void @s_shuffle_v2f32_v3f32__1_1() {
 }
 define void @s_shuffle_v2f32_v3f32__2_1() {
-; GFX9-LABEL: s_shuffle_v2f32_v3f32__2_1:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: ;;#ASMSTART
-; GFX9-NEXT: ; def s[8:10]
-; GFX9-NEXT: ;;#ASMEND
-; GFX9-NEXT: s_mov_b32 s8, s10
-; GFX9-NEXT: ;;#ASMSTART
-; GFX9-NEXT: ; use s[8:9]
-; GFX9-NEXT: ;;#ASMEND
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2f32_v3f32__2_1:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def s[4:6]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: s_mov_b32 s7, s5
+; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7]
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; use s[8:9]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2f32_v3f32__2_1:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def s[4:6]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_mov_b32 s7, s5
+; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; use s[8:9]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2f32_v3f32__2_1:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: ;;#ASMSTART
+; GFX942-NEXT: ; def s[0:2]
+; GFX942-NEXT: ;;#ASMEND
+; GFX942-NEXT: s_mov_b32 s3, s1
+; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT: ;;#ASMSTART
+; GFX942-NEXT: ; use s[8:9]
+; GFX942-NEXT: ;;#ASMEND
+; GFX942-NEXT: s_setpc_b64 s[30:31]
 %vec0 = call <3 x float> asm "; def $0", "=s"()
 %shuf = shufflevector <3 x float> %vec0, <3 x float> poison, <2 x i32>
 call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x float> %shuf)
@@ -3150,8 +3183,8 @@ define void @s_shuffle_v2f32_v3f32__2_2() {
 ; GFX900-NEXT: ;;#ASMSTART
 ; GFX900-NEXT: ; def s[4:6]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: s_mov_b32 s8, s6
-; GFX900-NEXT: s_mov_b32 s9, s6
+; GFX900-NEXT: s_mov_b32 s7, s6
+; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7]
 ; GFX900-NEXT: ;;#ASMSTART
 ; GFX900-NEXT: ; use s[8:9]
 ; GFX900-NEXT: ;;#ASMEND
@@ -3163,8 +3196,8 @@ define void @s_shuffle_v2f32_v3f32__2_2() {
 ; GFX90A-NEXT: ;;#ASMSTART
 ; GFX90A-NEXT: ; def s[4:6]
 ; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: s_mov_b32 s8, s6
-; GFX90A-NEXT: s_mov_b32 s9, s6
+; GFX90A-NEXT: s_mov_b32 s7, s6
+; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NEXT: ;;#ASMSTART
 ; GFX90A-NEXT: ; use s[8:9]
 ; GFX90A-NEXT: ;;#ASMEND
@@ -3176,8 +3209,8 @@ define void @s_shuffle_v2f32_v3f32__2_2() {
 ; GFX942-NEXT: ;;#ASMSTART
 ; GFX942-NEXT: ; def s[0:2]
 ; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_mov_b32 s8, s2
-; GFX942-NEXT: s_mov_b32 s9, s2
+; GFX942-NEXT: s_mov_b32 s3, s2
+; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3]
 ; GFX942-NEXT: ;;#ASMSTART
 ; GFX942-NEXT: ; use s[8:9]
 ; GFX942-NEXT: ;;#ASMEND
@@ -3388,7 +3421,7 @@ define void @s_shuffle_v2f32_v3f32__2_3() {
 ; GFX900-NEXT: ;;#ASMSTART
 ; GFX900-NEXT: ; def s[4:6]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: s_mov_b32 s8, s6
+; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7]
 ; GFX900-NEXT: ;;#ASMSTART
 ; GFX900-NEXT: ; use s[8:9]
 ; GFX900-NEXT: ;;#ASMEND
@@ -3400,7 +3433,7 @@ define void @s_shuffle_v2f32_v3f32__2_3() {
 ; GFX90A-NEXT: ;;#ASMSTART
 ; GFX90A-NEXT: ; def s[4:6]
 ; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: s_mov_b32 s8, s6
+; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NEXT: ;;#ASMSTART
 ; GFX90A-NEXT: ; use s[8:9]
 ; GFX90A-NEXT: ;;#ASMEND
@@ -3412,7 +3445,7 @@ define void @s_shuffle_v2f32_v3f32__2_3() {
 ; GFX942-NEXT: ;;#ASMSTART
 ; GFX942-NEXT: ; def s[0:2]
 ; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_mov_b32 s8, s2
+; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3]
 ; GFX942-NEXT: ;;#ASMSTART
 ; GFX942-NEXT: ; use s[8:9]
 ; GFX942-NEXT: ;;#ASMEND
@@ -3635,10 +3668,11 @@ define void @s_shuffle_v2f32_v3f32__2_4() {
 ; GFX900-NEXT: ;;#ASMSTART
 ; GFX900-NEXT: ; def s[8:10]
 ; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: s_mov_b32 s7, s9
 ; GFX900-NEXT: ;;#ASMSTART
 ; GFX900-NEXT: ; def s[4:6]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: s_mov_b32 s8, s6
+; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7]
 ; GFX900-NEXT: ;;#ASMSTART
 ; GFX900-NEXT: ; use s[8:9]
 ; GFX900-NEXT: ;;#ASMEND
@@ -3650,10 +3684,11 @@ define void @s_shuffle_v2f32_v3f32__2_4() {
 ; GFX90A-NEXT: ;;#ASMSTART
 ; GFX90A-NEXT: ; def s[8:10]
 ; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_mov_b32 s7, s9
 ; GFX90A-NEXT: ;;#ASMSTART
 ; GFX90A-NEXT: ; def s[4:6]
 ; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: s_mov_b32 s8, s6
+; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NEXT: ;;#ASMSTART
 ; GFX90A-NEXT: ; use s[8:9]
 ; GFX90A-NEXT: ;;#ASMEND
@@ -3663,12 +3698,13 @@ define void @s_shuffle_v2f32_v3f32__2_4() {
 ; GFX942: ; %bb.0:
 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def s[8:10]
+; GFX942-NEXT: ; def s[4:6]
 ; GFX942-NEXT: ;;#ASMEND
+; GFX942-NEXT: s_mov_b32 s3, s5
 ; GFX942-NEXT: ;;#ASMSTART
 ; GFX942-NEXT: ; def s[0:2]
 ; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_mov_b32 s8, s2
+; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3]
 ; GFX942-NEXT: ;;#ASMSTART
 ; GFX942-NEXT: ; use s[8:9]
 ; GFX942-NEXT: ;;#ASMEND
@@ -3897,11 +3933,11 @@ define void @s_shuffle_v2f32_v3f32__2_5() {
 ; GFX900-NEXT: ;;#ASMSTART
 ; GFX900-NEXT: ; def s[8:10]
 ; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: s_mov_b32 s7, s10
 ; GFX900-NEXT: ;;#ASMSTART
 ; GFX900-NEXT: ; def s[4:6]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: s_mov_b32 s8, s6
-; GFX900-NEXT: s_mov_b32 s9, s10
+; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7]
 ; GFX900-NEXT: ;;#ASMSTART
 ; GFX900-NEXT: ; use s[8:9]
 ; GFX900-NEXT: ;;#ASMEND
@@ -3913,11 +3949,11 @@ define void @s_shuffle_v2f32_v3f32__2_5() {
 ; GFX90A-NEXT: ;;#ASMSTART
 ; GFX90A-NEXT: ; def s[8:10]
 ; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_mov_b32 s7, s10
 ; GFX90A-NEXT: ;;#ASMSTART
 ; GFX90A-NEXT: ; def s[4:6]
 ; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: s_mov_b32 s8, s6
-; GFX90A-NEXT: s_mov_b32 s9, s10
+; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NEXT: ;;#ASMSTART
 ; GFX90A-NEXT: ; use s[8:9]
 ; GFX90A-NEXT: ;;#ASMEND
@@ -3927,13 +3963,13 @@ define void @s_shuffle_v2f32_v3f32__2_5() {
 ; GFX942: ; %bb.0:
 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def s[0:2]
+; GFX942-NEXT: ; def s[4:6]
 ; GFX942-NEXT: ;;#ASMEND
+; GFX942-NEXT: s_mov_b32 s3, s6
 ; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def s[4:6]
+; GFX942-NEXT: ; def s[0:2]
 ; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_mov_b32 s8, s2
-; GFX942-NEXT: s_mov_b32 s9, s6
+; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3]
 ; GFX942-NEXT: ;;#ASMSTART
 ; GFX942-NEXT: ; use s[8:9]
 ; GFX942-NEXT: ;;#ASMEND
diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v2i32.v2i32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v2i32.v2i32.ll
index 676a521757bd8..a86ca0a4a23c6 100644
--- a/llvm/test/CodeGen/AMDGPU/shufflevector.v2i32.v2i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v2i32.v2i32.ll
@@ -58,12 +58,11 @@ define void @v_shuffle_v2i32_v2i32__1_u(ptr addrspace(1) inreg %ptr) {
 ; GFX900-LABEL: v_shuffle_v2i32_v2i32__1_u:
 ; GFX900: ; %bb.0:
 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v2, 0
 ; GFX900-NEXT: ;;#ASMSTART
 ; GFX900-NEXT: ; def v[0:1]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v2, 0
-; GFX900-NEXT: v_mov_b32_e32 v0, v1
-; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17]
+; GFX900-NEXT: global_store_dwordx2 v2, v[1:2], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -111,12 +110,11 @@ define void @v_shuffle_v2i32_v2i32__3_u(ptr addrspace(1) inreg %ptr) {
 ; GFX900-LABEL: v_shuffle_v2i32_v2i32__3_u:
 ; GFX900: ; %bb.0:
 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v2, 0
 ; GFX900-NEXT: ;;#ASMSTART
 ; GFX900-NEXT: ; def v[0:1]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v2, 0
-; GFX900-NEXT: v_mov_b32_e32 v0, v1
-; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17]
+; GFX900-NEXT: global_store_dwordx2 v2, v[1:2], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -154,16 +152,14 @@ define void @v_shuffle_v2i32_v2i32__3_0(ptr addrspace(1) inreg %ptr) {
 ; GFX900-LABEL: v_shuffle_v2i32_v2i32__3_0:
 ; GFX900: ; %bb.0:
 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v4, 0
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:1]
+; GFX900-NEXT: ; def v[2:3]
 ; GFX900-NEXT: ;;#ASMEND
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[1:2]
+; GFX900-NEXT: ; def v[0:1]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v3, 0
-; GFX900-NEXT: v_mov_b32_e32 v1, v2
-; GFX900-NEXT: v_mov_b32_e32 v2, v0
-; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17]
+; GFX900-NEXT: global_store_dwordx2 v4, v[1:2], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -208,15 +204,14 @@ define void @v_shuffle_v2i32_v2i32__3_1(ptr addrspace(1) inreg %ptr) {
 ; GFX900-LABEL: v_shuffle_v2i32_v2i32__3_1:
 ; GFX900: ; %bb.0:
 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v3, 0
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:1]
+; GFX900-NEXT: ; def v[1:2]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v4, 0
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[2:3]
+; GFX900-NEXT: ; def v[0:1]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v0, v3
-; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17]
+; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -304,12 +299,12 @@ define void @v_shuffle_v2i32_v2i32__3_3(ptr addrspace(1) inreg %ptr) {
 ; GFX900-LABEL: v_shuffle_v2i32_v2i32__3_3:
 ; GFX900: ; %bb.0:
 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v3, 0
 ; GFX900-NEXT: ;;#ASMSTART
 ; GFX900-NEXT: ; def v[0:1]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v2, 0
-; GFX900-NEXT: v_mov_b32_e32 v0, v1
-; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v2, v1
+; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -347,12 +342,11 @@ define void @v_shuffle_v2i32_v2i32__u_0(ptr addrspace(1) inreg %ptr) {
 ; GFX900-LABEL: v_shuffle_v2i32_v2i32__u_0:
 ; GFX900: ; %bb.0:
 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v0, 0
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:1]
+; GFX900-NEXT: ; def v[1:2]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v2, 0
-; GFX900-NEXT: v_mov_b32_e32 v1, v0
-; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17]
+; GFX900-NEXT: global_store_dwordx2 v0, v[0:1], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -473,12 +467,11 @@ define void @v_shuffle_v2i32_v2i32__2_0(ptr addrspace(1) inreg %ptr) {
 ; GFX900-LABEL: v_shuffle_v2i32_v2i32__2_0:
 ; GFX900: ; %bb.0:
 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v0, 0
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:1]
+; GFX900-NEXT: ; def v[1:2]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v2, 0
-; GFX900-NEXT: v_mov_b32_e32 v1, v0
-; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17]
+; GFX900-NEXT: global_store_dwordx2 v0, v[0:1], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -593,12 +586,12 @@ define void @v_shuffle_v2i32_v2i32__1_1(ptr addrspace(1) inreg %ptr) {
 ; GFX900-LABEL: v_shuffle_v2i32_v2i32__1_1:
 ; GFX900: ; %bb.0:
 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v3, 0
 ; GFX900-NEXT: ;;#ASMSTART
 ; GFX900-NEXT: ; def v[0:1]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v2, 0
-; GFX900-NEXT: v_mov_b32_e32 v0, v1
-; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v2, v1
+; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -724,12 +717,11 @@ define void @v_shuffle_v2i32_v2i32__1_2(ptr addrspace(1) inreg %ptr) {
 ; GFX900-LABEL: v_shuffle_v2i32_v2i32__1_2:
 ; GFX900: ; %bb.0:
 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v2, 0
 ; GFX900-NEXT: ;;#ASMSTART
 ; GFX900-NEXT: ; def v[0:1]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v2, 0
-; GFX900-NEXT: v_mov_b32_e32 v0, v1
-; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17]
+; GFX900-NEXT: global_store_dwordx2 v2, v[1:2], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -874,11 +866,11 @@ define void @v_shuffle_v2i32_v2i32__1_3(ptr addrspace(1) inreg %ptr) {
 ; GFX900-NEXT: ; def v[2:3]
 ; GFX900-NEXT: ;;#ASMEND
 ; GFX900-NEXT: v_mov_b32_e32 v4, 0
+; GFX900-NEXT: v_mov_b32_e32 v2, v3
 ; GFX900-NEXT: ;;#ASMSTART
 ; GFX900-NEXT: ; def v[0:1]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v2, v1
-; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17]
+; GFX900-NEXT: global_store_dwordx2 v4, v[1:2], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v2i32.v3i32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v2i32.v3i32.ll
index f65340470feb1..d46ca61cff64d 100644
--- a/llvm/test/CodeGen/AMDGPU/shufflevector.v2i32.v3i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v2i32.v3i32.ll
@@ -99,36 +99,33 @@ define void @v_shuffle_v2i32_v3i32__2_u(ptr addrspace(1) inreg %ptr) {
 ; GFX900-LABEL: v_shuffle_v2i32_v3i32__2_u:
 ; GFX900: ; %bb.0:
 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v3, 0
 ; GFX900-NEXT: ;;#ASMSTART
 ; GFX900-NEXT: ; def v[0:2]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v3, 0
-; GFX900-NEXT: v_mov_b32_e32 v0, v2
-; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17]
+; GFX900-NEXT: global_store_dwordx2 v3, v[2:3], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX90A-LABEL: v_shuffle_v2i32_v3i32__2_u:
 ; GFX90A: ; %bb.0:
 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v3, 0
 ; GFX90A-NEXT: ;;#ASMSTART
 ; GFX90A-NEXT: ; def v[0:2]
 ; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v3, 0
-; GFX90A-NEXT: v_mov_b32_e32 v0, v2
-; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17]
+; GFX90A-NEXT: global_store_dwordx2 v3, v[2:3], s[16:17]
 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
 ; GFX90A-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX942-LABEL: v_shuffle_v2i32_v3i32__2_u:
 ; GFX942: ; %bb.0:
 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v3, 0
 ; GFX942-NEXT: ;;#ASMSTART
 ; GFX942-NEXT: ; def v[0:2]
 ; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v3, 0
-; GFX942-NEXT: v_mov_b32_e32 v0, v2
-; GFX942-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1]
+; GFX942-NEXT: global_store_dwordx2 v3, v[2:3], s[0:1]
 ; GFX942-NEXT: s_waitcnt vmcnt(0)
 ; GFX942-NEXT: s_setpc_b64 s[30:31]
 %vec0 = call <3 x i32> asm "; def $0", "=v"()
@@ -194,36 +191,33 @@ define void @v_shuffle_v2i32_v3i32__5_u(ptr addrspace(1) inreg %ptr) {
 ; GFX900-LABEL: v_shuffle_v2i32_v3i32__5_u:
 ; GFX900: ; %bb.0:
 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v3, 0
 ; GFX900-NEXT: ;;#ASMSTART
 ; GFX900-NEXT: ; def v[0:2]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v3, 0
-; GFX900-NEXT: v_mov_b32_e32 v0, v2
-; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17]
+; GFX900-NEXT: global_store_dwordx2 v3, v[2:3], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX90A-LABEL: v_shuffle_v2i32_v3i32__5_u:
 ; GFX90A: ; %bb.0:
 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v3, 0
 ; GFX90A-NEXT: ;;#ASMSTART
 ; GFX90A-NEXT: ; def v[0:2]
 ; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v3, 0
-; GFX90A-NEXT: v_mov_b32_e32 v0, v2
-; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17]
+; GFX90A-NEXT: global_store_dwordx2 v3, v[2:3], s[16:17]
 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
 ; GFX90A-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX942-LABEL: v_shuffle_v2i32_v3i32__5_u:
 ; GFX942: ; %bb.0:
 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v3, 0
 ; GFX942-NEXT: ;;#ASMSTART
 ; GFX942-NEXT: ; def v[0:2]
 ; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v3, 0
-; GFX942-NEXT: v_mov_b32_e32 v0, v2
-; GFX942-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1]
+; GFX942-NEXT: global_store_dwordx2 v3, v[2:3], s[0:1]
 ; GFX942-NEXT: s_waitcnt vmcnt(0)
 ; GFX942-NEXT: s_setpc_b64 s[30:31]
 %vec0 = call <3 x i32> asm "; def $0", "=v"()
@@ -237,49 +231,45 @@ define void @v_shuffle_v2i32_v3i32__5_0(ptr addrspace(1) inreg %ptr) {
 ; GFX900-LABEL: v_shuffle_v2i32_v3i32__5_0:
 ; GFX900: ; %bb.0:
 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v6, 0
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:2]
+; GFX900-NEXT: ; def v[3:5]
 ; GFX900-NEXT: ;;#ASMEND
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[1:3]
+; GFX900-NEXT: ; def v[0:2]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v4, 0
-; GFX900-NEXT: v_mov_b32_e32 v1, v3
-; GFX900-NEXT: v_mov_b32_e32 v2, v0
-; GFX900-NEXT: global_store_dwordx2 v4, v[1:2], s[16:17]
+; GFX900-NEXT: global_store_dwordx2 v6, v[2:3], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX90A-LABEL: v_shuffle_v2i32_v3i32__5_0:
 ; GFX90A: ; %bb.0:
 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v7, 0
 ; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:2]
+; GFX90A-NEXT: ; def v[4:6]
 ; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v3, v4
 ; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:4]
+; GFX90A-NEXT: ; def v[0:2]
 ; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
-; GFX90A-NEXT: v_mov_b32_e32 v2, v4
-; GFX90A-NEXT: v_mov_b32_e32 v3, v0
-; GFX90A-NEXT: global_store_dwordx2 v5, v[2:3], s[16:17]
+; GFX90A-NEXT: global_store_dwordx2 v7, v[2:3], s[16:17]
 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
 ; GFX90A-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX942-LABEL: v_shuffle_v2i32_v3i32__5_0:
 ; GFX942: ; %bb.0:
 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v7, 0
 ; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:2]
+; GFX942-NEXT: ; def v[4:6]
 ; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v5, 0
 ; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:4]
+; GFX942-NEXT: ; def v[0:2]
 ; GFX942-NEXT: ;;#ASMEND
 ; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v2, v4
-; GFX942-NEXT: v_mov_b32_e32 v3, v0
-; GFX942-NEXT: global_store_dwordx2 v5, v[2:3], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v3, v4
+; GFX942-NEXT: global_store_dwordx2 v7, v[2:3], s[0:1]
 ; GFX942-NEXT: s_waitcnt vmcnt(0)
 ; GFX942-NEXT: s_setpc_b64 s[30:31]
 %vec0 = call <3 x i32> asm "; def $0", "=v"()
@@ -293,46 +283,43 @@ define void @v_shuffle_v2i32_v3i32__5_1(ptr addrspace(1) inreg %ptr) {
 ; GFX900-LABEL: v_shuffle_v2i32_v3i32__5_1:
 ; GFX900: ; %bb.0:
 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:2]
-; GFX900-NEXT: ;;#ASMEND
 ; GFX900-NEXT: v_mov_b32_e32 v5, 0
 ; GFX900-NEXT: ;;#ASMSTART
 ; GFX900-NEXT: ; def v[2:4]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v0, v4
-; GFX900-NEXT: global_store_dwordx2 v5, v[0:1], s[16:17]
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def v[0:2]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: global_store_dwordx2 v5, v[2:3], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX90A-LABEL: v_shuffle_v2i32_v3i32__5_1:
 ; GFX90A: ; %bb.0:
 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:2]
-; GFX90A-NEXT: ;;#ASMEND
 ; GFX90A-NEXT: v_mov_b32_e32 v5, 0
 ; GFX90A-NEXT: ;;#ASMSTART
 ; GFX90A-NEXT: ; def v[2:4]
 ; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v4
-; GFX90A-NEXT: global_store_dwordx2 v5, v[0:1], s[16:17]
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def v[0:2]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: global_store_dwordx2 v5, v[2:3], s[16:17]
 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
 ; GFX90A-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX942-LABEL: v_shuffle_v2i32_v3i32__5_1:
 ; GFX942: ; %bb.0:
 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:2]
-; GFX942-NEXT: ;;#ASMEND
 ; GFX942-NEXT: v_mov_b32_e32 v5, 0
 ; GFX942-NEXT: ;;#ASMSTART
 ; GFX942-NEXT: ; def v[2:4]
 ; GFX942-NEXT: ;;#ASMEND
 ; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v0, v4
-; GFX942-NEXT: global_store_dwordx2 v5, v[0:1], s[0:1]
+; GFX942-NEXT: ;;#ASMSTART
+; GFX942-NEXT: ; def v[0:2]
+; GFX942-NEXT: ;;#ASMEND
+; GFX942-NEXT: global_store_dwordx2 v5, v[2:3], s[0:1]
 ; GFX942-NEXT: s_waitcnt vmcnt(0)
 ; GFX942-NEXT: s_setpc_b64 s[30:31]
 %vec0 = call <3 x i32> asm "; def $0", "=v"()
@@ -346,15 +333,14 @@ define void @v_shuffle_v2i32_v3i32__5_2(ptr addrspace(1) inreg %ptr) {
 ; GFX900-LABEL: v_shuffle_v2i32_v3i32__5_2:
 ; GFX900: ; %bb.0:
 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v4, 0
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:2]
+; GFX900-NEXT: ; def v[1:3]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v6, 0
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[3:5]
+; GFX900-NEXT: ; def v[0:2]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v1, v5
-; GFX900-NEXT: global_store_dwordx2 v6, v[1:2], s[16:17]
+; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -362,15 +348,14 @@ define void @v_shuffle_v2i32_v3i32__5_2(ptr addrspace(1) inreg %ptr) {
 ; GFX90A: ; %bb.0:
 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:2]
+; GFX90A-NEXT: ; def v[2:4]
 ; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v3, 0
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-NEXT: v_mov_b32_e32 v3, v4
 ; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[4:6]
+; GFX90A-NEXT: ; def v[0:2]
 ; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v6
-; GFX90A-NEXT: v_mov_b32_e32 v1, v2
-; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17]
+; GFX90A-NEXT: global_store_dwordx2 v5, v[2:3], s[16:17]
 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
 ; GFX90A-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -378,15 +363,14 @@ define void @v_shuffle_v2i32_v3i32__5_2(ptr addrspace(1) inreg %ptr) {
 ; GFX942: ; %bb.0:
 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:2]
+; GFX942-NEXT: ; def v[2:4]
 ; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v3, 0
+; GFX942-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-NEXT: v_mov_b32_e32 v3, v4
 ; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[4:6]
+; GFX942-NEXT: ; def v[0:2]
 ; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v1, v2
-; GFX942-NEXT: v_mov_b32_e32 v0, v6
-; GFX942-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1]
+; GFX942-NEXT: global_store_dwordx2 v5, v[2:3], s[0:1]
 ; GFX942-NEXT: s_waitcnt vmcnt(0)
 ; GFX942-NEXT: s_setpc_b64 s[30:31]
 %vec0 = call <3 x i32> asm "; def $0", "=v"()
@@ -400,13 +384,12 @@ define void @v_shuffle_v2i32_v3i32__5_3(ptr addrspace(1) inreg %ptr) {
 ; GFX900-LABEL: v_shuffle_v2i32_v3i32__5_3:
 ; GFX900: ; %bb.0:
 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v4, 0
 ; GFX900-NEXT: ;;#ASMSTART
 ; GFX900-NEXT: ; def v[0:2]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v3, 0
-; GFX900-NEXT: v_mov_b32_e32 v1, v2
-; GFX900-NEXT: v_mov_b32_e32 v2, v0
-; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v3, v0
+; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -445,36 +428,37 @@ define void @v_shuffle_v2i32_v3i32__5_4(ptr addrspace(1) inreg %ptr) {
 ; GFX900-LABEL: v_shuffle_v2i32_v3i32__5_4:
 ; GFX900: ; %bb.0:
 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v4, 0
 ; GFX900-NEXT: ;;#ASMSTART
 ; GFX900-NEXT: ; def v[0:2]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v3, 0
-; GFX900-NEXT: v_mov_b32_e32 v0, v2
-; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v3, v1
+; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX90A-LABEL: v_shuffle_v2i32_v3i32__5_4:
 ; GFX90A: ; %bb.0:
 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
 ; GFX90A-NEXT: ;;#ASMSTART
 ; GFX90A-NEXT: ; def v[0:2]
 ; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v3, 0
-; GFX90A-NEXT: v_mov_b32_e32 v0, v2
-; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v1
+; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17]
 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
 ; GFX90A-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX942-LABEL: v_shuffle_v2i32_v3i32__5_4:
 ; GFX942: ; %bb.0:
 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
 ; GFX942-NEXT: ;;#ASMSTART
 ; GFX942-NEXT: ; def v[0:2]
 ; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v3, 0
-; GFX942-NEXT: v_mov_b32_e32 v0, v2
-; GFX942-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v3, v1
+; GFX942-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1]
 ; GFX942-NEXT: s_waitcnt vmcnt(0)
 ; GFX942-NEXT: s_setpc_b64 s[30:31]
 %vec0 = call <3 x i32> asm "; def $0", "=v"()
@@ -488,38 +472,37 @@ define void @v_shuffle_v2i32_v3i32__5_5(ptr addrspace(1) inreg %ptr) {
 ; GFX900-LABEL: v_shuffle_v2i32_v3i32__5_5:
 ; GFX900: ; %bb.0:
 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v4, 0
 ; GFX900-NEXT: ;;#ASMSTART
 ; GFX900-NEXT: ; def v[0:2]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v3, 0
-; GFX900-NEXT: v_mov_b32_e32 v1, v2
-; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v3, v2
+; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX90A-LABEL: v_shuffle_v2i32_v3i32__5_5:
 ; GFX90A: ; %bb.0:
 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
 ; GFX90A-NEXT: ;;#ASMSTART
 ; GFX90A-NEXT: ; def v[0:2]
 ; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v3, 0
-; GFX90A-NEXT: v_mov_b32_e32 v0, v2
-; GFX90A-NEXT: v_mov_b32_e32 v1, v2
-; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
+; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17]
 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
 ; GFX90A-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX942-LABEL: v_shuffle_v2i32_v3i32__5_5:
 ; GFX942: ; %bb.0:
 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
 ; GFX942-NEXT: ;;#ASMSTART
 ; GFX942-NEXT: ; def v[0:2]
GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -533,12 +516,11 @@ define void @v_shuffle_v2i32_v3i32__u_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2i32_v3i32__u_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v0, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -659,13 +641,12 @@ define void @v_shuffle_v2i32_v3i32__2_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2i32_v3i32__2_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -703,12 +684,11 @@ define void @v_shuffle_v2i32_v3i32__3_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2i32_v3i32__3_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v0, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -918,36 +898,37 @@ define void @v_shuffle_v2i32_v3i32__2_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2i32_v3i32__2_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2i32_v3i32__2_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2i32_v3i32__2_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -1178,38 +1159,37 @@ define void @v_shuffle_v2i32_v3i32__2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2i32_v3i32__2_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2i32_v3i32__2_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2i32_v3i32__2_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -1408,36 +1388,33 @@ define void @v_shuffle_v2i32_v3i32__2_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2i32_v3i32__2_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v3, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2i32_v3i32__2_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v3, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2i32_v3i32__2_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v3, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -1654,42 +1631,42 @@ define void @v_shuffle_v2i32_v3i32__2_4(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx2 v6, v[3:4], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v6, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2i32_v3i32__2_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: global_store_dwordx2 v3, v[4:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v7, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2i32_v3i32__2_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: global_store_dwordx2 v3, v[4:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v5 +; GFX942-NEXT: global_store_dwordx2 v7, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -1939,43 +1916,42 @@ define void @v_shuffle_v2i32_v3i32__2_5(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, v2 -; GFX900-NEXT: global_store_dwordx2 v6, v[4:5], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v6, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2i32_v3i32__2_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v6 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 -; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v7, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2i32_v3i32__2_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v6 -; GFX942-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v6 +; GFX942-NEXT: global_store_dwordx2 v7, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -2175,7 +2151,7 @@ define void @s_shuffle_v2i32_v3i32__2_u() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] ; GFX900-NEXT: ;;#ASMEND @@ -2187,7 +2163,7 @@ define void @s_shuffle_v2i32_v3i32__2_u() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] ; GFX90A-NEXT: ;;#ASMEND @@ -2199,7 +2175,7 @@ define void @s_shuffle_v2i32_v3i32__2_u() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s2 +; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3] ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:9] ; GFX942-NEXT: ;;#ASMEND @@ -2274,7 +2250,7 @@ define void @s_shuffle_v2i32_v3i32__5_u() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] ; GFX900-NEXT: ;;#ASMEND @@ -2286,7 +2262,7 @@ define void @s_shuffle_v2i32_v3i32__5_u() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] ; GFX90A-NEXT: ;;#ASMEND @@ -2298,7 +2274,7 @@ define void @s_shuffle_v2i32_v3i32__5_u() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s2 +; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3] ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:9] ; GFX942-NEXT: ;;#ASMEND @@ -2317,11 +2293,11 @@ define void @s_shuffle_v2i32_v3i32__5_0() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[8:10] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s7, s8 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] ; GFX900-NEXT: ;;#ASMEND @@ -2333,11 +2309,11 @@ define void @s_shuffle_v2i32_v3i32__5_0() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[8:10] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s7, s8 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] ; GFX90A-NEXT: ;;#ASMEND @@ -2347,13 +2323,13 @@ define void @s_shuffle_v2i32_v3i32__5_0() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART 
-; GFX942-NEXT: ; def s[0:2] +; GFX942-NEXT: ; def s[4:6] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s3, s4 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[4:6] +; GFX942-NEXT: ; def s[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s6 -; GFX942-NEXT: s_mov_b32 s9, s0 +; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3] ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:9] ; GFX942-NEXT: ;;#ASMEND @@ -2372,10 +2348,11 @@ define void @s_shuffle_v2i32_v3i32__5_1() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[8:10] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s7, s9 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] ; GFX900-NEXT: ;;#ASMEND @@ -2387,10 +2364,11 @@ define void @s_shuffle_v2i32_v3i32__5_1() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[8:10] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s7, s9 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] ; GFX90A-NEXT: ;;#ASMEND @@ -2400,12 +2378,13 @@ define void @s_shuffle_v2i32_v3i32__5_1() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:10] +; GFX942-NEXT: ; def s[4:6] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s3, s5 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s2 +; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3] ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:9] ; GFX942-NEXT: ;;#ASMEND @@ -2424,11 +2403,11 @@ define void @s_shuffle_v2i32_v3i32__5_2() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[8:10] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s7, s10 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s6 +; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] ; GFX900-NEXT: ;;#ASMEND @@ -2440,11 +2419,11 @@ define void @s_shuffle_v2i32_v3i32__5_2() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[8:10] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s7, s10 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s6 +; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] ; GFX90A-NEXT: ;;#ASMEND @@ -2454,13 +2433,13 @@ define void @s_shuffle_v2i32_v3i32__5_2() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:2] +; GFX942-NEXT: ; def s[4:6] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s3, s6 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[4:6] +; GFX942-NEXT: ; def s[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s6 -; GFX942-NEXT: s_mov_b32 s9, s2 +; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3] ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:9] ; GFX942-NEXT: ;;#ASMEND @@ -2479,8 +2458,8 @@ define void @s_shuffle_v2i32_v3i32__5_3() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s6 -; GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: s_mov_b32 s7, s4 +; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX900-NEXT: 
;;#ASMSTART
 ; GFX900-NEXT: ; use s[8:9]
 ; GFX900-NEXT: ;;#ASMEND
@@ -2492,8 +2471,8 @@ define void @s_shuffle_v2i32_v3i32__5_3() {
 ; GFX90A-NEXT: ;;#ASMSTART
 ; GFX90A-NEXT: ; def s[4:6]
 ; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: s_mov_b32 s8, s6
-; GFX90A-NEXT: s_mov_b32 s9, s4
+; GFX90A-NEXT: s_mov_b32 s7, s4
+; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NEXT: ;;#ASMSTART
 ; GFX90A-NEXT: ; use s[8:9]
 ; GFX90A-NEXT: ;;#ASMEND
@@ -2505,8 +2484,8 @@ define void @s_shuffle_v2i32_v3i32__5_3() {
 ; GFX942-NEXT: ;;#ASMSTART
 ; GFX942-NEXT: ; def s[0:2]
 ; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_mov_b32 s8, s2
-; GFX942-NEXT: s_mov_b32 s9, s0
+; GFX942-NEXT: s_mov_b32 s3, s0
+; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3]
 ; GFX942-NEXT: ;;#ASMSTART
 ; GFX942-NEXT: ; use s[8:9]
 ; GFX942-NEXT: ;;#ASMEND
@@ -2519,17 +2498,44 @@ define void @s_shuffle_v2i32_v3i32__5_3() {
 }
 
 define void @s_shuffle_v2i32_v3i32__5_4() {
-; GFX9-LABEL: s_shuffle_v2i32_v3i32__5_4:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: ;;#ASMSTART
-; GFX9-NEXT: ; def s[8:10]
-; GFX9-NEXT: ;;#ASMEND
-; GFX9-NEXT: s_mov_b32 s8, s10
-; GFX9-NEXT: ;;#ASMSTART
-; GFX9-NEXT: ; use s[8:9]
-; GFX9-NEXT: ;;#ASMEND
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2i32_v3i32__5_4:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def s[4:6]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: s_mov_b32 s7, s5
+; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7]
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; use s[8:9]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2i32_v3i32__5_4:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def s[4:6]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_mov_b32 s7, s5
+; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; use s[8:9]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2i32_v3i32__5_4:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: ;;#ASMSTART
+; GFX942-NEXT: ; def s[0:2]
+; GFX942-NEXT: ;;#ASMEND
+; GFX942-NEXT: s_mov_b32 s3, s1
+; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT: ;;#ASMSTART
+; GFX942-NEXT: ; use s[8:9]
+; GFX942-NEXT: ;;#ASMEND
+; GFX942-NEXT: s_setpc_b64 s[30:31]
 %vec0 = call <3 x i32> asm "; def $0", "=s"()
 %vec1 = call <3 x i32> asm "; def $0", "=s"()
 %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <2 x i32> <i32 5, i32 4>
@@ -2544,8 +2550,8 @@ define void @s_shuffle_v2i32_v3i32__5_5() {
 ; GFX900-NEXT: ;;#ASMSTART
 ; GFX900-NEXT: ; def s[4:6]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: s_mov_b32 s8, s6
-; GFX900-NEXT: s_mov_b32 s9, s6
+; GFX900-NEXT: s_mov_b32 s7, s6
+; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7]
 ; GFX900-NEXT: ;;#ASMSTART
 ; GFX900-NEXT: ; use s[8:9]
 ; GFX900-NEXT: ;;#ASMEND
@@ -2557,8 +2563,8 @@ define void @s_shuffle_v2i32_v3i32__5_5() {
 ; GFX90A-NEXT: ;;#ASMSTART
 ; GFX90A-NEXT: ; def s[4:6]
 ; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: s_mov_b32 s8, s6
-; GFX90A-NEXT: s_mov_b32 s9, s6
+; GFX90A-NEXT: s_mov_b32 s7, s6
+; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NEXT: ;;#ASMSTART
 ; GFX90A-NEXT: ; use s[8:9]
 ; GFX90A-NEXT: ;;#ASMEND
@@ -2570,8 +2576,8 @@ define void @s_shuffle_v2i32_v3i32__5_5() {
 ; GFX942-NEXT: ;;#ASMSTART
 ; GFX942-NEXT: ; def s[0:2]
 ; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_mov_b32 s8, s2
-; GFX942-NEXT: s_mov_b32 s9, s2
+; GFX942-NEXT: s_mov_b32 s3, s2
+; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3]
 ; GFX942-NEXT: ;;#ASMSTART
 ; GFX942-NEXT: ; use s[8:9]
 ; GFX942-NEXT: ;;#ASMEND
@@ -2695,8 +2701,8 @@ define void @s_shuffle_v2i32_v3i32__2_0() {
 ; GFX900-NEXT: ;;#ASMSTART
 ; GFX900-NEXT: ; def s[4:6]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: s_mov_b32 s8, s6
-; GFX900-NEXT: s_mov_b32 s9, s4
+; GFX900-NEXT: s_mov_b32 s7, s4
+; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7]
 ; GFX900-NEXT: ;;#ASMSTART
 ; GFX900-NEXT: ; use s[8:9]
 ; GFX900-NEXT: ;;#ASMEND
@@ -2708,8 +2714,8 @@ define void @s_shuffle_v2i32_v3i32__2_0() {
 ; GFX90A-NEXT: ;;#ASMSTART
 ; GFX90A-NEXT: ; def s[4:6]
 ; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: s_mov_b32 s8, s6
-; GFX90A-NEXT: s_mov_b32 s9, s4
+; GFX90A-NEXT: s_mov_b32 s7, s4
+; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NEXT: ;;#ASMSTART
 ; GFX90A-NEXT: ; use s[8:9]
 ; GFX90A-NEXT: ;;#ASMEND
@@ -2721,8 +2727,8 @@ define void @s_shuffle_v2i32_v3i32__2_0() {
 ; GFX942-NEXT: ;;#ASMSTART
 ; GFX942-NEXT: ; def s[0:2]
 ; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_mov_b32 s8, s2
-; GFX942-NEXT: s_mov_b32 s9, s0
+; GFX942-NEXT: s_mov_b32 s3, s0
+; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3]
 ; GFX942-NEXT: ;;#ASMSTART
 ; GFX942-NEXT: ; use s[8:9]
 ; GFX942-NEXT: ;;#ASMEND
@@ -2929,17 +2935,44 @@ define void @s_shuffle_v2i32_v3i32__1_1() {
 }
 
 define void @s_shuffle_v2i32_v3i32__2_1() {
-; GFX9-LABEL: s_shuffle_v2i32_v3i32__2_1:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: ;;#ASMSTART
-; GFX9-NEXT: ; def s[8:10]
-; GFX9-NEXT: ;;#ASMEND
-; GFX9-NEXT: s_mov_b32 s8, s10
-; GFX9-NEXT: ;;#ASMSTART
-; GFX9-NEXT: ; use s[8:9]
-; GFX9-NEXT: ;;#ASMEND
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2i32_v3i32__2_1:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def s[4:6]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: s_mov_b32 s7, s5
+; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7]
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; use s[8:9]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2i32_v3i32__2_1:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def s[4:6]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_mov_b32 s7, s5
+; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; use s[8:9]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2i32_v3i32__2_1:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: ;;#ASMSTART
+; GFX942-NEXT: ; def s[0:2]
+; GFX942-NEXT: ;;#ASMEND
+; GFX942-NEXT: s_mov_b32 s3, s1
+; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT: ;;#ASMSTART
+; GFX942-NEXT: ; use s[8:9]
+; GFX942-NEXT: ;;#ASMEND
+; GFX942-NEXT: s_setpc_b64 s[30:31]
 %vec0 = call <3 x i32> asm "; def $0", "=s"()
 %shuf = shufflevector <3 x i32> %vec0, <3 x i32> poison, <2 x i32> <i32 2, i32 1>
 call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x i32> %shuf)
@@ -3150,8 +3183,8 @@ define void @s_shuffle_v2i32_v3i32__2_2() {
 ; GFX900-NEXT: ;;#ASMSTART
 ; GFX900-NEXT: ; def s[4:6]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: s_mov_b32 s8, s6
-; GFX900-NEXT: s_mov_b32 s9, s6
+; GFX900-NEXT: s_mov_b32 s7, s6
+; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7]
 ; GFX900-NEXT: ;;#ASMSTART
 ; GFX900-NEXT: ; use s[8:9]
 ; GFX900-NEXT: ;;#ASMEND
@@ -3163,8 +3196,8 @@ define void @s_shuffle_v2i32_v3i32__2_2() {
 ; GFX90A-NEXT: ;;#ASMSTART
 ; GFX90A-NEXT: ; 
def s[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: s_mov_b32 s9, s6 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] ; GFX90A-NEXT: ;;#ASMEND @@ -3176,8 +3209,8 @@ define void @s_shuffle_v2i32_v3i32__2_2() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s2 -; GFX942-NEXT: s_mov_b32 s9, s2 +; GFX942-NEXT: s_mov_b32 s3, s2 +; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3] ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:9] ; GFX942-NEXT: ;;#ASMEND @@ -3388,7 +3421,7 @@ define void @s_shuffle_v2i32_v3i32__2_3() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] ; GFX900-NEXT: ;;#ASMEND @@ -3400,7 +3433,7 @@ define void @s_shuffle_v2i32_v3i32__2_3() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] ; GFX90A-NEXT: ;;#ASMEND @@ -3412,7 +3445,7 @@ define void @s_shuffle_v2i32_v3i32__2_3() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s2 +; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3] ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:9] ; GFX942-NEXT: ;;#ASMEND @@ -3635,10 +3668,11 @@ define void @s_shuffle_v2i32_v3i32__2_4() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[8:10] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s7, s9 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] ; GFX900-NEXT: ;;#ASMEND @@ -3650,10 +3684,11 @@ define void @s_shuffle_v2i32_v3i32__2_4() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[8:10] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s7, s9 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] ; GFX90A-NEXT: ;;#ASMEND @@ -3663,12 +3698,13 @@ define void @s_shuffle_v2i32_v3i32__2_4() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:10] +; GFX942-NEXT: ; def s[4:6] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s3, s5 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s2 +; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3] ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:9] ; GFX942-NEXT: ;;#ASMEND @@ -3897,11 +3933,11 @@ define void @s_shuffle_v2i32_v3i32__2_5() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[8:10] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s7, s10 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s6 -; GFX900-NEXT: s_mov_b32 s9, s10 +; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] ; GFX900-NEXT: ;;#ASMEND @@ -3913,11 +3949,11 @@ define void @s_shuffle_v2i32_v3i32__2_5() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[8:10] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s7, s10 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; 
def s[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: s_mov_b32 s9, s10 +; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] ; GFX90A-NEXT: ;;#ASMEND @@ -3927,13 +3963,13 @@ define void @s_shuffle_v2i32_v3i32__2_5() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:2] +; GFX942-NEXT: ; def s[4:6] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s3, s6 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[4:6] +; GFX942-NEXT: ; def s[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s2 -; GFX942-NEXT: s_mov_b32 s9, s6 +; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3] ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:9] ; GFX942-NEXT: ;;#ASMEND diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v2p3.v2p3.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v2p3.v2p3.ll index 299dfba482953..02fb06ef54d42 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v2p3.v2p3.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v2p3.v2p3.ll @@ -58,12 +58,11 @@ define void @v_shuffle_v2p3_v2p3__1_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p3_v2p3__1_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v2, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -111,12 +110,11 @@ define void @v_shuffle_v2p3_v2p3__3_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p3_v2p3__3_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v2, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -154,16 +152,14 @@ define void @v_shuffle_v2p3_v2p3__3_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p3_v2p3__3_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v4, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -208,15 +204,14 @@ define void @v_shuffle_v2p3_v2p3__3_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p3_v2p3__3_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ; def v[1:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; 
GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -304,12 +299,12 @@ define void @v_shuffle_v2p3_v2p3__3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p3_v2p3__3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -347,12 +342,11 @@ define void @v_shuffle_v2p3_v2p3__u_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p3_v2p3__u_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ; def v[1:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v0 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v0, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -473,12 +467,11 @@ define void @v_shuffle_v2p3_v2p3__2_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p3_v2p3__2_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ; def v[1:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v0 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v0, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -593,12 +586,12 @@ define void @v_shuffle_v2p3_v2p3__1_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p3_v2p3__1_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -724,12 +717,11 @@ define void @v_shuffle_v2p3_v2p3__1_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p3_v2p3__1_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v2, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -874,11 +866,11 @@ define void @v_shuffle_v2p3_v2p3__1_3(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: 
v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v4, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v2p3.v3p3.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v2p3.v3p3.ll index 13e3d94c35446..d0f00f8363aed 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v2p3.v3p3.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v2p3.v3p3.ll @@ -99,36 +99,33 @@ define void @v_shuffle_v2p3_v3p3__2_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p3_v3p3__2_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v3, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2p3_v3p3__2_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v3, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2p3_v3p3__2_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v3, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -194,36 +191,33 @@ define void @v_shuffle_v2p3_v3p3__5_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p3_v3p3__5_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v3, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2p3_v3p3__5_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v3, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2p3_v3p3__5_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] +; 
GFX942-NEXT: global_store_dwordx2 v3, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -237,49 +231,45 @@ define void @v_shuffle_v2p3_v3p3__5_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p3_v3p3__5_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v4, v[1:2], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v6, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2p3_v3p3__5_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: global_store_dwordx2 v5, v[2:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v7, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2p3_v3p3__5_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: global_store_dwordx2 v5, v[2:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: global_store_dwordx2 v7, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -293,46 +283,43 @@ define void @v_shuffle_v2p3_v3p3__5_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p3_v3p3__5_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] -; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: global_store_dwordx2 v5, v[0:1], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v5, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2p3_v3p3__5_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] -; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: global_store_dwordx2 v5, v[0:1], s[16:17] 
+; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v5, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2p3_v3p3__5_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] -; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: global_store_dwordx2 v5, v[0:1], s[0:1] +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: global_store_dwordx2 v5, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -346,15 +333,14 @@ define void @v_shuffle_v2p3_v3p3__5_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p3_v3p3__5_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v1, v5 -; GFX900-NEXT: global_store_dwordx2 v6, v[1:2], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -362,15 +348,14 @@ define void @v_shuffle_v2p3_v3p3__5_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v5, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -378,15 +363,14 @@ define void @v_shuffle_v2p3_v3p3__5_2(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v4 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v5, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -400,13 +384,12 @@ define void @v_shuffle_v2p3_v3p3__5_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p3_v3p3__5_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 
v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -445,36 +428,37 @@ define void @v_shuffle_v2p3_v3p3__5_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p3_v3p3__5_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2p3_v3p3__5_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2p3_v3p3__5_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -488,38 +472,37 @@ define void @v_shuffle_v2p3_v3p3__5_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p3_v3p3__5_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2p3_v3p3__5_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2p3_v3p3__5_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, 
v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -533,12 +516,11 @@ define void @v_shuffle_v2p3_v3p3__u_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p3_v3p3__u_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v0, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -659,13 +641,12 @@ define void @v_shuffle_v2p3_v3p3__2_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p3_v3p3__2_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -703,12 +684,11 @@ define void @v_shuffle_v2p3_v3p3__3_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p3_v3p3__3_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v0, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -918,36 +898,37 @@ define void @v_shuffle_v2p3_v3p3__2_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p3_v3p3__2_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2p3_v3p3__2_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2p3_v3p3__2_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: 
;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1178,38 +1159,37 @@ define void @v_shuffle_v2p3_v3p3__2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p3_v3p3__2_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2p3_v3p3__2_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2p3_v3p3__2_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1408,36 +1388,33 @@ define void @v_shuffle_v2p3_v3p3__2_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p3_v3p3__2_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v3, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2p3_v3p3__2_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v3, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2p3_v3p3__2_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def 
v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v3, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1654,42 +1631,42 @@ define void @v_shuffle_v2p3_v3p3__2_4(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx2 v6, v[3:4], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v6, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2p3_v3p3__2_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: global_store_dwordx2 v3, v[4:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v7, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2p3_v3p3__2_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: global_store_dwordx2 v3, v[4:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v5 +; GFX942-NEXT: global_store_dwordx2 v7, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1939,43 +1916,42 @@ define void @v_shuffle_v2p3_v3p3__2_5(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, v2 -; GFX900-NEXT: global_store_dwordx2 v6, v[4:5], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v6, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2p3_v3p3__2_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v6 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 -; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v7, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2p3_v3p3__2_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; 
GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v6 -; GFX942-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v6 +; GFX942-NEXT: global_store_dwordx2 v7, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -2175,7 +2151,7 @@ define void @s_shuffle_v2p3_v3p3__2_u() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] ; GFX900-NEXT: ;;#ASMEND @@ -2187,7 +2163,7 @@ define void @s_shuffle_v2p3_v3p3__2_u() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] ; GFX90A-NEXT: ;;#ASMEND @@ -2199,7 +2175,7 @@ define void @s_shuffle_v2p3_v3p3__2_u() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s2 +; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3] ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:9] ; GFX942-NEXT: ;;#ASMEND @@ -2274,7 +2250,7 @@ define void @s_shuffle_v2p3_v3p3__5_u() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] ; GFX900-NEXT: ;;#ASMEND @@ -2286,7 +2262,7 @@ define void @s_shuffle_v2p3_v3p3__5_u() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] ; GFX90A-NEXT: ;;#ASMEND @@ -2298,7 +2274,7 @@ define void @s_shuffle_v2p3_v3p3__5_u() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s2 +; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3] ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:9] ; GFX942-NEXT: ;;#ASMEND @@ -2317,11 +2293,11 @@ define void @s_shuffle_v2p3_v3p3__5_0() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[8:10] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s7, s8 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] ; GFX900-NEXT: ;;#ASMEND @@ -2333,11 +2309,11 @@ define void @s_shuffle_v2p3_v3p3__5_0() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[8:10] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s7, s8 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] ; GFX90A-NEXT: ;;#ASMEND @@ -2347,13 +2323,13 @@ define void @s_shuffle_v2p3_v3p3__5_0() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:2] +; GFX942-NEXT: ; def s[4:6] ; GFX942-NEXT: 
;;#ASMEND +; GFX942-NEXT: s_mov_b32 s3, s4 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[4:6] +; GFX942-NEXT: ; def s[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s6 -; GFX942-NEXT: s_mov_b32 s9, s0 +; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3] ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:9] ; GFX942-NEXT: ;;#ASMEND @@ -2372,10 +2348,11 @@ define void @s_shuffle_v2p3_v3p3__5_1() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[8:10] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s7, s9 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] ; GFX900-NEXT: ;;#ASMEND @@ -2387,10 +2364,11 @@ define void @s_shuffle_v2p3_v3p3__5_1() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[8:10] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s7, s9 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] ; GFX90A-NEXT: ;;#ASMEND @@ -2400,12 +2378,13 @@ define void @s_shuffle_v2p3_v3p3__5_1() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:10] +; GFX942-NEXT: ; def s[4:6] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s3, s5 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s2 +; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3] ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:9] ; GFX942-NEXT: ;;#ASMEND @@ -2424,11 +2403,11 @@ define void @s_shuffle_v2p3_v3p3__5_2() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[8:10] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s7, s10 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s6 +; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] ; GFX900-NEXT: ;;#ASMEND @@ -2440,11 +2419,11 @@ define void @s_shuffle_v2p3_v3p3__5_2() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[8:10] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s7, s10 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s6 +; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] ; GFX90A-NEXT: ;;#ASMEND @@ -2454,13 +2433,13 @@ define void @s_shuffle_v2p3_v3p3__5_2() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:2] +; GFX942-NEXT: ; def s[4:6] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s3, s6 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[4:6] +; GFX942-NEXT: ; def s[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s6 -; GFX942-NEXT: s_mov_b32 s9, s2 +; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3] ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:9] ; GFX942-NEXT: ;;#ASMEND @@ -2479,8 +2458,8 @@ define void @s_shuffle_v2p3_v3p3__5_3() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s6 -; GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: s_mov_b32 s7, s4 +; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] ; GFX900-NEXT: ;;#ASMEND @@ -2492,8 +2471,8 @@ 
define void @s_shuffle_v2p3_v3p3__5_3() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT: s_mov_b32 s7, s4 +; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] ; GFX90A-NEXT: ;;#ASMEND @@ -2505,8 +2484,8 @@ define void @s_shuffle_v2p3_v3p3__5_3() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s2 -; GFX942-NEXT: s_mov_b32 s9, s0 +; GFX942-NEXT: s_mov_b32 s3, s0 +; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3] ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:9] ; GFX942-NEXT: ;;#ASMEND @@ -2519,17 +2498,44 @@ define void @s_shuffle_v2p3_v3p3__5_3() { } define void @s_shuffle_v2p3_v3p3__5_4() { -; GFX9-LABEL: s_shuffle_v2p3_v3p3__5_4: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; def s[8:10] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s8, s10 -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; use s[8:9] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: s_shuffle_v2p3_v3p3__5_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v3p3__5_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX942-LABEL: s_shuffle_v2p3_v3p3__5_4: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:2] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s3, s1 +; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3] +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:9] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <2 x i32> <i32 5, i32 4> @@ -2544,8 +2550,8 @@ define void @s_shuffle_v2p3_v3p3__5_5() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s6 -; GFX900-NEXT: s_mov_b32 s9, s6 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] ; GFX900-NEXT: ;;#ASMEND @@ -2557,8 +2563,8 @@ define void @s_shuffle_v2p3_v3p3__5_5() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: s_mov_b32 s9, s6 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] ; GFX90A-NEXT: ;;#ASMEND @@ -2570,8 +2576,8 @@ define void @s_shuffle_v2p3_v3p3__5_5() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s2 -; GFX942-NEXT: s_mov_b32 s9, s2 +; GFX942-NEXT: s_mov_b32 s3, s2 +; GFX942-NEXT: s_mov_b64
s[8:9], s[2:3] ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:9] ; GFX942-NEXT: ;;#ASMEND @@ -2695,8 +2701,8 @@ define void @s_shuffle_v2p3_v3p3__2_0() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s6 -; GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: s_mov_b32 s7, s4 +; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] ; GFX900-NEXT: ;;#ASMEND @@ -2708,8 +2714,8 @@ define void @s_shuffle_v2p3_v3p3__2_0() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT: s_mov_b32 s7, s4 +; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] ; GFX90A-NEXT: ;;#ASMEND @@ -2721,8 +2727,8 @@ define void @s_shuffle_v2p3_v3p3__2_0() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s2 -; GFX942-NEXT: s_mov_b32 s9, s0 +; GFX942-NEXT: s_mov_b32 s3, s0 +; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3] ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:9] ; GFX942-NEXT: ;;#ASMEND @@ -2929,17 +2935,44 @@ define void @s_shuffle_v2p3_v3p3__1_1() { } define void @s_shuffle_v2p3_v3p3__2_1() { -; GFX9-LABEL: s_shuffle_v2p3_v3p3__2_1: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; def s[8:10] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s8, s10 -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; use s[8:9] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: s_shuffle_v2p3_v3p3__2_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v3p3__2_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX942-LABEL: s_shuffle_v2p3_v3p3__2_1: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:2] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s3, s1 +; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3] +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:9] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> poison, <2 x i32> <i32 2, i32 1> call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) @@ -3150,8 +3183,8 @@ define void @s_shuffle_v2p3_v3p3__2_2() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s6 -; GFX900-NEXT: s_mov_b32 s9, s6 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] ; GFX900-NEXT: ;;#ASMEND @@ -3163,8 +3196,8 @@ define void @s_shuffle_v2p3_v3p3__2_2() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT:
;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: s_mov_b32 s9, s6 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] ; GFX90A-NEXT: ;;#ASMEND @@ -3176,8 +3209,8 @@ define void @s_shuffle_v2p3_v3p3__2_2() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s2 -; GFX942-NEXT: s_mov_b32 s9, s2 +; GFX942-NEXT: s_mov_b32 s3, s2 +; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3] ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:9] ; GFX942-NEXT: ;;#ASMEND @@ -3388,7 +3421,7 @@ define void @s_shuffle_v2p3_v3p3__2_3() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] ; GFX900-NEXT: ;;#ASMEND @@ -3400,7 +3433,7 @@ define void @s_shuffle_v2p3_v3p3__2_3() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] ; GFX90A-NEXT: ;;#ASMEND @@ -3412,7 +3445,7 @@ define void @s_shuffle_v2p3_v3p3__2_3() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s2 +; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3] ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:9] ; GFX942-NEXT: ;;#ASMEND @@ -3635,10 +3668,11 @@ define void @s_shuffle_v2p3_v3p3__2_4() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[8:10] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s7, s9 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] ; GFX900-NEXT: ;;#ASMEND @@ -3650,10 +3684,11 @@ define void @s_shuffle_v2p3_v3p3__2_4() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[8:10] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s7, s9 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] ; GFX90A-NEXT: ;;#ASMEND @@ -3663,12 +3698,13 @@ define void @s_shuffle_v2p3_v3p3__2_4() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:10] +; GFX942-NEXT: ; def s[4:6] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s3, s5 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s2 +; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3] ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:9] ; GFX942-NEXT: ;;#ASMEND @@ -3897,11 +3933,11 @@ define void @s_shuffle_v2p3_v3p3__2_5() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[8:10] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s7, s10 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s6 -; GFX900-NEXT: s_mov_b32 s9, s10 +; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] ; GFX900-NEXT: ;;#ASMEND @@ -3913,11 +3949,11 @@ define void @s_shuffle_v2p3_v3p3__2_5() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[8:10] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s7, s10 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND -; 
GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: s_mov_b32 s9, s10 +; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] ; GFX90A-NEXT: ;;#ASMEND @@ -3927,13 +3963,13 @@ define void @s_shuffle_v2p3_v3p3__2_5() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:2] +; GFX942-NEXT: ; def s[4:6] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s3, s6 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[4:6] +; GFX942-NEXT: ; def s[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s2 -; GFX942-NEXT: s_mov_b32 s9, s6 +; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3] ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:9] ; GFX942-NEXT: ;;#ASMEND diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3f32.v2f32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3f32.v2f32.ll index 430f64164d24f..35cf10f1135c9 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v3f32.v2f32.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v3f32.v2f32.ll @@ -58,12 +58,11 @@ define void @v_shuffle_v3f32_v2f32__1_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v2f32__1_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx3 v2, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v2, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -111,12 +110,11 @@ define void @v_shuffle_v3f32_v2f32__3_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v2f32__3_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx3 v2, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v2, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -154,15 +152,14 @@ define void @v_shuffle_v3f32_v2f32__3_0_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v2f32__3_0_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] -; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -208,15 +205,14 @@ define void @v_shuffle_v3f32_v2f32__3_1_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v2f32__3_1_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ; def v[1:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] 
+; GFX900-NEXT: global_store_dwordx3 v3, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -263,10 +259,10 @@ define void @v_shuffle_v3f32_v2f32__3_2_u(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: global_store_dwordx3 v3, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -307,12 +303,12 @@ define void @v_shuffle_v3f32_v2f32__3_3_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v2f32__3_3_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx3 v2, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v3, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -350,15 +346,15 @@ define void @v_shuffle_v3f32_v2f32__3_3_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v2f32__3_3_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ; def v[3:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v5, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -403,14 +399,14 @@ define void @v_shuffle_v3f32_v2f32__3_3_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -458,11 +454,11 @@ define void @v_shuffle_v3f32_v2f32__3_3_2(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -503,13 +499,13 @@ define void @v_shuffle_v3f32_v2f32__3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v2f32__3_3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 ; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -561,26 +557,25 @@ define void @v_shuffle_v3f32_v2f32__u_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3f32_v2f32__u_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx3 v0, v[0:2], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3f32_v2f32__u_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:1] +; GFX942-NEXT: ; def v[2:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v1, v0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v1, v2 +; GFX942-NEXT: global_store_dwordx3 v0, v[0:2], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x float> asm "; def $0", "=v"() @@ -638,13 +633,13 @@ define void @v_shuffle_v3f32_v2f32__1_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v2f32__1_0_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -696,26 +691,25 @@ define void @v_shuffle_v3f32_v2f32__2_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3f32_v2f32__2_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx3 v0, v[0:2], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3f32_v2f32__2_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:1] +; GFX942-NEXT: ; def v[2:3] ; GFX942-NEXT: ;;#ASMEND -; 
GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v1, v0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v1, v2 +; GFX942-NEXT: global_store_dwordx3 v0, v[0:2], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x float> asm "; def $0", "=v"() @@ -729,15 +723,14 @@ define void @v_shuffle_v3f32_v2f32__3_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -784,15 +777,14 @@ define void @v_shuffle_v3f32_v2f32__3_u_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -836,16 +828,15 @@ define void @v_shuffle_v3f32_v2f32__3_1_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v2f32__3_1_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ; def v[3:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v5, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -891,16 +882,15 @@ define void @v_shuffle_v3f32_v2f32__3_2_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v2f32__3_2_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[3:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v5, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1033,13 +1023,13 @@ define void @v_shuffle_v3f32_v2f32__1_1_1(ptr 
addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v2f32__1_1_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 ; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1121,16 +1111,15 @@ define void @v_shuffle_v3f32_v2f32__3_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v2f32__3_1_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ; def v[1:2] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, v2 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1176,15 +1165,14 @@ define void @v_shuffle_v3f32_v2f32__3_u_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v2f32__3_u_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1230,15 +1218,14 @@ define void @v_shuffle_v3f32_v2f32__3_0_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v2f32__3_0_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:4] +; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v4 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1289,13 +1276,12 @@ define void @v_shuffle_v3f32_v2f32__3_2_1(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1393,12 +1379,11 @@ define void 
@v_shuffle_v3f32_v2f32__1_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v2f32__1_2_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx3 v2, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v2, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1446,13 +1431,13 @@ define void @v_shuffle_v3f32_v2f32__3_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v2f32__3_2_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1493,13 +1478,12 @@ define void @v_shuffle_v3f32_v2f32__3_u_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v2f32__3_u_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx3 v2, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1539,30 +1523,29 @@ define void @v_shuffle_v3f32_v2f32__3_0_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3f32_v2f32__3_0_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:3] -; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v5 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 ; GFX90A-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -1570,16 +1553,16 @@ define void @v_shuffle_v3f32_v2f32__3_0_2(ptr addrspace(1) inreg %ptr) { ; GFX942-LABEL: v_shuffle_v3f32_v2f32__3_0_2: ; GFX942: ; %bb.0: 
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:3] -; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: v_mov_b32_e32 v0, v5 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[2:3] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v1, v4 ; GFX942-NEXT: global_store_dwordx3 v6, v[0:2], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] @@ -1595,14 +1578,14 @@ define void @v_shuffle_v3f32_v2f32__3_1_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ; def v[1:2] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1746,15 +1729,14 @@ define void @v_shuffle_v3f32_v2f32__1_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1844,12 +1826,12 @@ define void @v_shuffle_v3f32_v2f32__3_u_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v2f32__3_u_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx3 v2, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1890,15 +1872,14 @@ define void @v_shuffle_v3f32_v2f32__3_0_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1947,15 +1928,14 @@ define void 
@v_shuffle_v3f32_v2f32__3_1_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ; def v[1:2] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2001,12 +1981,13 @@ define void @v_shuffle_v3f32_v2f32__3_2_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v2f32__3_2_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2370,7 +2351,29 @@ define void @s_shuffle_v3f32_v2f32__3_3_u() { } define void @s_shuffle_v3f32_v2f32__3_3_0() { -; GFX900-LABEL: s_shuffle_v3f32_v2f32__3_3_0: +; GFX9-LABEL: s_shuffle_v3f32_v2f32__3_3_0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[8:9] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s9 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:10] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=s"() + %vec1 = call <2 x float> asm "; def $0", "=s"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <3 x i32> <i32 3, i32 3, i32 0> + call void asm sideeffect "; use $0", "{s[8:10]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v2f32__3_3_1() { +; GFX900-LABEL: s_shuffle_v3f32_v2f32__3_3_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART @@ -2380,13 +2383,13 @@ define void @s_shuffle_v3f32_v2f32__3_3_0() { ; GFX900-NEXT: ; def s[4:5] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s8, s9 -; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s10, s5 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:10] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v3f32_v2f32__3_3_0: +; GFX90A-LABEL: s_shuffle_v3f32_v2f32__3_3_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART @@ -2396,13 +2399,13 @@ define void @s_shuffle_v3f32_v2f32__3_3_0() { ; GFX90A-NEXT: ; def s[4:5] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_mov_b32 s8, s9 -; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s10, s5 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:10] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v3f32_v2f32__3_3_0: +; GFX942-LABEL: s_shuffle_v3f32_v2f32__3_3_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART @@ -2412,115 +2415,31
@@ define void @s_shuffle_v3f32_v2f32__3_3_0() { ; GFX942-NEXT: ; def s[0:1] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_mov_b32 s8, s9 -; GFX942-NEXT: s_mov_b32 s10, s0 +; GFX942-NEXT: s_mov_b32 s10, s1 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:10] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x float> asm "; def $0", "=s"() %vec1 = call <2 x float> asm "; def $0", "=s"() - %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <3 x i32> <i32 3, i32 3, i32 0> - call void asm sideeffect "; use $0", "{s[8:10]}"(<3 x float> %shuf) - ret void -} - -define void @s_shuffle_v3f32_v2f32__3_3_1() { -; GFX900-LABEL: s_shuffle_v3f32_v2f32__3_3_1: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:9] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:5] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s9 -; GFX900-NEXT: s_mov_b32 s10, s5 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:10] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v3f32_v2f32__3_3_1: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:9] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:5] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s9 -; GFX90A-NEXT: s_mov_b32 s10, s5 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:10] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v3f32_v2f32__3_3_1: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:9] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:1] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s9 -; GFX942-NEXT: s_mov_b32 s10, s1 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:10] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <2 x float> asm "; def $0", "=s"() - %vec1 = call <2 x float> asm "; def $0", "=s"() - %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <3 x i32> <i32 3, i32 3, i32 1> + %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <3 x i32> <i32 3, i32 3, i32 1> call void asm sideeffect "; use $0", "{s[8:10]}"(<3 x float> %shuf) ret void } define void @s_shuffle_v3f32_v2f32__3_3_2() { -; GFX900-LABEL: s_shuffle_v3f32_v2f32__3_3_2: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:5] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s5 -; GFX900-NEXT: s_mov_b32 s9, s5 -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:10] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v3f32_v2f32__3_3_2: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:5] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s5 -; GFX90A-NEXT: s_mov_b32 s9, s5 -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:10] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v3f32_v2f32__3_3_2: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:1] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s1 -; GFX942-NEXT: s_mov_b32 s9,
s1 -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:10] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: s_shuffle_v3f32_v2f32__3_3_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s11 +; GFX9-NEXT: s_mov_b32 s9, s11 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:10] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x float> asm "; def $0", "=s"() %vec1 = call <2 x float> asm "; def $0", "=s"() %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <3 x i32> <i32 3, i32 3, i32 2> @@ -2549,44 +2468,17 @@ define void @s_shuffle_v3f32_v2f32__3_3_3() { } define void @s_shuffle_v3f32_v2f32__u_0_0() { -; GFX900-LABEL: s_shuffle_v3f32_v2f32__u_0_0: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:5] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s9, s4 -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:10] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v3f32_v2f32__u_0_0: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:5] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s9, s4 -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:10] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v3f32_v2f32__u_0_0: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:1] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s9, s0 -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:10] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: s_shuffle_v3f32_v2f32__u_0_0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s9, s10 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:10] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x float> asm "; def $0", "=s"() %shuf = shufflevector <2 x float> %vec0, <2 x float> poison, <3 x i32> <i32 poison, i32 0, i32 0> call void asm sideeffect "; use $0", "{s[8:10]}"(<3 x float> %shuf) @@ -2613,47 +2505,18 @@ define void @s_shuffle_v3f32_v2f32__0_0_0() { } define void @s_shuffle_v3f32_v2f32__1_0_0() { -; GFX900-LABEL: s_shuffle_v3f32_v2f32__1_0_0: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:5] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s5 -; GFX900-NEXT: s_mov_b32 s9, s4 -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:10] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v3f32_v2f32__1_0_0: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:5] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s5 -; GFX90A-NEXT: s_mov_b32 s9, s4 -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:10] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL:
s_shuffle_v3f32_v2f32__1_0_0: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:1] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s1 -; GFX942-NEXT: s_mov_b32 s9, s0 -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:10] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: s_shuffle_v3f32_v2f32__1_0_0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s11 +; GFX9-NEXT: s_mov_b32 s9, s10 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:10] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x float> asm "; def $0", "=s"() %shuf = shufflevector <2 x float> %vec0, <2 x float> poison, <3 x i32> <i32 1, i32 0, i32 0> call void asm sideeffect "; use $0", "{s[8:10]}"(<3 x float> %shuf) @@ -2661,44 +2524,17 @@ define void @s_shuffle_v3f32_v2f32__1_0_0() { } define void @s_shuffle_v3f32_v2f32__2_0_0() { -; GFX900-LABEL: s_shuffle_v3f32_v2f32__2_0_0: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:5] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s9, s4 -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:10] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v3f32_v2f32__2_0_0: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:5] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s9, s4 -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:10] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v3f32_v2f32__2_0_0: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:1] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s9, s0 -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:10] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: s_shuffle_v3f32_v2f32__2_0_0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s9, s10 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:10] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x float> asm "; def $0", "=s"() %shuf = shufflevector <2 x float> %vec0, <2 x float> poison, <3 x i32> <i32 2, i32 0, i32 0> call void asm sideeffect "; use $0", "{s[8:10]}"(<3 x float> %shuf) @@ -2710,14 +2546,13 @@ define void @s_shuffle_v3f32_v2f32__3_0_0() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ; def s[10:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ; def s[4:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s7 -; GFX900-NEXT: s_mov_b32 s9, s4 -; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s8, s5 +; GFX900-NEXT: s_mov_b32 s9, s10 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:10] ; GFX900-NEXT: ;;#ASMEND @@ -2727,14 +2562,13 @@ define void @s_shuffle_v3f32_v2f32__3_0_0() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0)
lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ; def s[10:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ; def s[4:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s7 -; GFX90A-NEXT: s_mov_b32 s9, s4 -; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s8, s5 +; GFX90A-NEXT: s_mov_b32 s9, s10 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:10] ; GFX90A-NEXT: ;;#ASMEND @@ -2744,14 +2578,13 @@ define void @s_shuffle_v3f32_v2f32__3_0_0() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:1] +; GFX942-NEXT: ; def s[10:11] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[2:3] +; GFX942-NEXT: ; def s[0:1] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s3 -; GFX942-NEXT: s_mov_b32 s9, s0 -; GFX942-NEXT: s_mov_b32 s10, s0 +; GFX942-NEXT: s_mov_b32 s8, s1 +; GFX942-NEXT: s_mov_b32 s9, s10 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:10] ; GFX942-NEXT: ;;#ASMEND @@ -2768,13 +2601,12 @@ define void @s_shuffle_v3f32_v2f32__3_u_0() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ; def s[10:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ; def s[4:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s7 -; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s8, s5 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:10] ; GFX900-NEXT: ;;#ASMEND @@ -2784,13 +2616,12 @@ define void @s_shuffle_v3f32_v2f32__3_u_0() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ; def s[10:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ; def s[4:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s7 -; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s8, s5 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:10] ; GFX90A-NEXT: ;;#ASMEND @@ -2800,13 +2631,12 @@ define void @s_shuffle_v3f32_v2f32__3_u_0() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:1] +; GFX942-NEXT: ; def s[10:11] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[2:3] +; GFX942-NEXT: ; def s[0:1] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s3 -; GFX942-NEXT: s_mov_b32 s10, s0 +; GFX942-NEXT: s_mov_b32 s8, s1 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:10] ; GFX942-NEXT: ;;#ASMEND @@ -2823,14 +2653,13 @@ define void @s_shuffle_v3f32_v2f32__3_1_0() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ; def s[10:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ; def s[4:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s7 -; GFX900-NEXT: s_mov_b32 s9, s5 -; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s8, s5 +; GFX900-NEXT: s_mov_b32 s9, s11 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:10] ; GFX900-NEXT: ;;#ASMEND @@ -2840,14 +2669,13 @@ define void @s_shuffle_v3f32_v2f32__3_1_0() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:5] +; 
GFX90A-NEXT: ; def s[10:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ; def s[4:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s7 -; GFX90A-NEXT: s_mov_b32 s9, s5 -; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s8, s5 +; GFX90A-NEXT: s_mov_b32 s9, s11 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:10] ; GFX90A-NEXT: ;;#ASMEND @@ -2857,14 +2685,13 @@ define void @s_shuffle_v3f32_v2f32__3_1_0() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:1] +; GFX942-NEXT: ; def s[10:11] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[2:3] +; GFX942-NEXT: ; def s[0:1] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s3 -; GFX942-NEXT: s_mov_b32 s9, s1 -; GFX942-NEXT: s_mov_b32 s10, s0 +; GFX942-NEXT: s_mov_b32 s8, s1 +; GFX942-NEXT: s_mov_b32 s9, s11 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:10] ; GFX942-NEXT: ;;#ASMEND @@ -2881,14 +2708,13 @@ define void @s_shuffle_v3f32_v2f32__3_2_0() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ; def s[10:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ; def s[4:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s7 -; GFX900-NEXT: s_mov_b32 s9, s6 -; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s8, s5 +; GFX900-NEXT: s_mov_b32 s9, s4 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:10] ; GFX900-NEXT: ;;#ASMEND @@ -2898,14 +2724,13 @@ define void @s_shuffle_v3f32_v2f32__3_2_0() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ; def s[10:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ; def s[4:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s7 -; GFX90A-NEXT: s_mov_b32 s9, s6 -; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s8, s5 +; GFX90A-NEXT: s_mov_b32 s9, s4 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:10] ; GFX90A-NEXT: ;;#ASMEND @@ -2915,14 +2740,13 @@ define void @s_shuffle_v3f32_v2f32__3_2_0() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:1] +; GFX942-NEXT: ; def s[10:11] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[2:3] +; GFX942-NEXT: ; def s[0:1] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s3 -; GFX942-NEXT: s_mov_b32 s9, s2 -; GFX942-NEXT: s_mov_b32 s10, s0 +; GFX942-NEXT: s_mov_b32 s8, s1 +; GFX942-NEXT: s_mov_b32 s9, s0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:10] ; GFX942-NEXT: ;;#ASMEND @@ -3344,47 +3168,18 @@ define void @s_shuffle_v3f32_v2f32__2_2_2() { } define void @s_shuffle_v3f32_v2f32__3_2_2() { -; GFX900-LABEL: s_shuffle_v3f32_v2f32__3_2_2: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:5] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s5 -; GFX900-NEXT: s_mov_b32 s9, s4 -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:10] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v3f32_v2f32__3_2_2: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: 
;;#ASMSTART -; GFX90A-NEXT: ; def s[4:5] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s5 -; GFX90A-NEXT: s_mov_b32 s9, s4 -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:10] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v3f32_v2f32__3_2_2: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:1] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s1 -; GFX942-NEXT: s_mov_b32 s9, s0 -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:10] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: s_shuffle_v3f32_v2f32__3_2_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s11 +; GFX9-NEXT: s_mov_b32 s9, s10 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:10] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x float> asm "; def $0", "=s"() %vec1 = call <2 x float> asm "; def $0", "=s"() %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <3 x i32> @@ -3393,44 +3188,17 @@ define void @s_shuffle_v3f32_v2f32__3_2_2() { } define void @s_shuffle_v3f32_v2f32__3_u_2() { -; GFX900-LABEL: s_shuffle_v3f32_v2f32__3_u_2: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:5] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s5 -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:10] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v3f32_v2f32__3_u_2: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:5] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s5 -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:10] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v3f32_v2f32__3_u_2: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:1] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s1 -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:10] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: s_shuffle_v3f32_v2f32__3_u_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s11 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:10] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x float> asm "; def $0", "=s"() %vec1 = call <2 x float> asm "; def $0", "=s"() %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <3 x i32> @@ -3446,11 +3214,10 @@ define void @s_shuffle_v3f32_v2f32__3_0_2() { ; GFX900-NEXT: ; def s[4:5] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ; def s[10:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s7 +; GFX900-NEXT: s_mov_b32 s8, s11 ; GFX900-NEXT: s_mov_b32 s9, s4 -; GFX900-NEXT: s_mov_b32 s10, s6 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:10] ; GFX900-NEXT: ;;#ASMEND @@ -3463,11 +3230,10 @@ define void 
@s_shuffle_v3f32_v2f32__3_0_2() { ; GFX90A-NEXT: ; def s[4:5] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ; def s[10:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s7 +; GFX90A-NEXT: s_mov_b32 s8, s11 ; GFX90A-NEXT: s_mov_b32 s9, s4 -; GFX90A-NEXT: s_mov_b32 s10, s6 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:10] ; GFX90A-NEXT: ;;#ASMEND @@ -3480,11 +3246,10 @@ define void @s_shuffle_v3f32_v2f32__3_0_2() { ; GFX942-NEXT: ; def s[0:1] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[2:3] +; GFX942-NEXT: ; def s[10:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s3 +; GFX942-NEXT: s_mov_b32 s8, s11 ; GFX942-NEXT: s_mov_b32 s9, s0 -; GFX942-NEXT: s_mov_b32 s10, s2 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:10] ; GFX942-NEXT: ;;#ASMEND @@ -3497,53 +3262,20 @@ define void @s_shuffle_v3f32_v2f32__3_0_2() { } define void @s_shuffle_v3f32_v2f32__3_1_2() { -; GFX900-LABEL: s_shuffle_v3f32_v2f32__3_1_2: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:9] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:5] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s5 -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:10] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v3f32_v2f32__3_1_2: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:9] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:5] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s5 -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:10] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v3f32_v2f32__3_1_2: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:9] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:1] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s1 -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:10] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: s_shuffle_v3f32_v2f32__3_1_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[8:9] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s11 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:10] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x float> asm "; def $0", "=s"() %vec1 = call <2 x float> asm "; def $0", "=s"() %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <3 x i32> diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3f32.v3f32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3f32.v3f32.ll index ef670e963bdb6..befc1126d6fa4 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v3f32.v3f32.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v3f32.v3f32.ll @@ -58,12 +58,11 @@ define void @v_shuffle_v3f32_v3f32__1_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v3f32__1_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: 
;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v3, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -153,12 +152,11 @@ define void @v_shuffle_v3f32_v3f32__4_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v3f32__4_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v3, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -240,15 +238,14 @@ define void @v_shuffle_v3f32_v3f32__5_0_u(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -355,9 +352,8 @@ define void @v_shuffle_v3f32_v3f32__5_2_u(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: global_store_dwordx3 v6, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -403,13 +399,12 @@ define void @v_shuffle_v3f32_v3f32__5_3_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v3f32__5_3_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx3 v3, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -495,9 +490,8 @@ define void @v_shuffle_v3f32_v3f32__5_5_u(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 ; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v3, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -540,14 +534,12 @@ define void @v_shuffle_v3f32_v3f32__5_5_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 ; GFX900-NEXT: 
;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v6, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -597,16 +589,14 @@ define void @v_shuffle_v3f32_v3f32__5_5_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v4 -; GFX900-NEXT: v_mov_b32_e32 v4, v1 -; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: global_store_dwordx3 v5, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -656,15 +646,14 @@ define void @v_shuffle_v3f32_v3f32__5_5_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v1, v5 -; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -760,14 +749,13 @@ define void @v_shuffle_v3f32_v3f32__5_5_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v3f32__5_5_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: v_mov_b32_e32 v4, v2 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -813,10 +801,10 @@ define void @v_shuffle_v3f32_v3f32__5_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -857,12 +845,11 @@ define void @v_shuffle_v3f32_v3f32__u_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; 
GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v0 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v0, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -994,13 +981,12 @@ define void @v_shuffle_v3f32_v3f32__2_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1041,12 +1027,11 @@ define void @v_shuffle_v3f32_v3f32__3_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v0 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v0, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1088,14 +1073,12 @@ define void @v_shuffle_v3f32_v3f32__4_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx3 v6, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1145,16 +1128,15 @@ define void @v_shuffle_v3f32_v3f32__5_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1376,16 +1358,15 @@ define void @v_shuffle_v3f32_v3f32__5_3_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 
; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: global_store_dwordx3 v7, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1574,10 +1555,10 @@ define void @v_shuffle_v3f32_v3f32__1_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1704,15 +1685,14 @@ define void @v_shuffle_v3f32_v3f32__4_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx3 v5, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1816,15 +1796,14 @@ define void @v_shuffle_v3f32_v3f32__5_u_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, v5 +; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1872,16 +1851,14 @@ define void @v_shuffle_v3f32_v3f32__5_0_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: v_mov_b32_e32 v4, v1 -; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, v5 +; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1931,16 +1908,15 @@ define void @v_shuffle_v3f32_v3f32__5_2_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; 
GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: v_mov_b32_e32 v4, v2 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: global_store_dwordx3 v7, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1989,16 +1965,15 @@ define void @v_shuffle_v3f32_v3f32__5_3_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v3, v4 -; GFX900-NEXT: v_mov_b32_e32 v4, v2 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2100,12 +2075,12 @@ define void @v_shuffle_v3f32_v3f32__u_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v3f32__u_2_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2184,13 +2159,12 @@ define void @v_shuffle_v3f32_v3f32__1_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v3f32__1_2_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2232,10 +2206,10 @@ define void @v_shuffle_v3f32_v3f32__2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2274,12 +2248,12 @@ define void @v_shuffle_v3f32_v3f32__3_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v3f32__3_2_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: 
;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2317,15 +2291,14 @@ define void @v_shuffle_v3f32_v3f32__4_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2374,13 +2347,13 @@ define void @v_shuffle_v3f32_v3f32__5_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx3 v6, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2480,16 +2453,15 @@ define void @v_shuffle_v3f32_v3f32__5_0_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v2 -; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx3 v7, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2591,15 +2563,15 @@ define void @v_shuffle_v3f32_v3f32__5_3_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2751,12 +2723,11 @@ define void @v_shuffle_v3f32_v3f32__1_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: 
v_shuffle_v3f32_v3f32__1_3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v3, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2895,13 +2866,12 @@ define void @v_shuffle_v3f32_v3f32__5_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2988,16 +2958,14 @@ define void @v_shuffle_v3f32_v3f32__5_0_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: v_mov_b32_e32 v4, v1 -; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3103,10 +3071,8 @@ define void @v_shuffle_v3f32_v3f32__5_2_3(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: global_store_dwordx3 v6, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3306,9 +3272,8 @@ define void @v_shuffle_v3f32_v3f32__1_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: v_mov_b32_e32 v4, v3 -; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx3 v5, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3456,10 +3421,10 @@ define void @v_shuffle_v3f32_v3f32__4_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3545,13 +3510,12 @@ define void 
@v_shuffle_v3f32_v3f32__5_u_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v3f32__5_u_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3592,15 +3556,15 @@ define void @v_shuffle_v3f32_v3f32__5_0_4(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v2 -; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3705,14 +3669,13 @@ define void @v_shuffle_v3f32_v3f32__5_2_4(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: global_store_dwordx3 v6, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3760,13 +3723,12 @@ define void @v_shuffle_v3f32_v3f32__5_3_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v3f32__5_3_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: v_mov_b32_e32 v4, v1 -; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3807,12 +3769,12 @@ define void @v_shuffle_v3f32_v3f32__u_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v3f32__u_5_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3913,9 +3875,9 @@ define void 
@v_shuffle_v3f32_v3f32__1_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 ; GFX900-NEXT: v_mov_b32_e32 v3, v4 -; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v5, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3963,15 +3925,15 @@ define void @v_shuffle_v3f32_v3f32__2_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 ; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: v_mov_b32_e32 v4, v5 -; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4060,13 +4022,12 @@ define void @v_shuffle_v3f32_v3f32__4_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v3f32__4_5_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4150,15 +4111,15 @@ define void @v_shuffle_v3f32_v3f32__5_0_5(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4262,15 +4223,15 @@ define void @v_shuffle_v3f32_v3f32__5_2_5(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v5 ; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: v_mov_b32_e32 v4, v2 -; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v6, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4316,14 +4277,13 @@ define void @v_shuffle_v3f32_v3f32__5_3_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v3f32__5_3_5: ; 
GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v2 -; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3i32.v2i32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3i32.v2i32.ll index ea4fac3b1d2b1..51d45922893b3 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v3i32.v2i32.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v3i32.v2i32.ll @@ -58,12 +58,11 @@ define void @v_shuffle_v3i32_v2i32__1_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v2i32__1_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx3 v2, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v2, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -111,12 +110,11 @@ define void @v_shuffle_v3i32_v2i32__3_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v2i32__3_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx3 v2, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v2, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -154,15 +152,14 @@ define void @v_shuffle_v3i32_v2i32__3_0_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v2i32__3_0_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] -; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -208,15 +205,14 @@ define void @v_shuffle_v3i32_v2i32__3_1_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v2i32__3_1_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ; def v[1:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v3, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ 
-263,10 +259,10 @@ define void @v_shuffle_v3i32_v2i32__3_2_u(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: global_store_dwordx3 v3, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -307,12 +303,12 @@ define void @v_shuffle_v3i32_v2i32__3_3_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v2i32__3_3_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx3 v2, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v3, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -350,15 +346,15 @@ define void @v_shuffle_v3i32_v2i32__3_3_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v2i32__3_3_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ; def v[3:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v5, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -403,14 +399,14 @@ define void @v_shuffle_v3i32_v2i32__3_3_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -458,11 +454,11 @@ define void @v_shuffle_v3i32_v2i32__3_3_2(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -503,13 +499,13 @@ define void @v_shuffle_v3i32_v2i32__3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v2i32__3_3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; 
GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 ; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -561,26 +557,25 @@ define void @v_shuffle_v3i32_v2i32__u_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3i32_v2i32__u_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx3 v0, v[0:2], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i32_v2i32__u_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:1] +; GFX942-NEXT: ; def v[2:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v1, v0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v1, v2 +; GFX942-NEXT: global_store_dwordx3 v0, v[0:2], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i32> asm "; def $0", "=v"() @@ -638,13 +633,13 @@ define void @v_shuffle_v3i32_v2i32__1_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v2i32__1_0_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -696,26 +691,25 @@ define void @v_shuffle_v3i32_v2i32__2_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3i32_v2i32__2_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx3 v0, v[0:2], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i32_v2i32__2_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:1] +; GFX942-NEXT: ; def v[2:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v1, v0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: 
global_store_dwordx3 v3, v[0:2], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v1, v2 +; GFX942-NEXT: global_store_dwordx3 v0, v[0:2], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i32> asm "; def $0", "=v"() @@ -729,15 +723,14 @@ define void @v_shuffle_v3i32_v2i32__3_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -784,15 +777,14 @@ define void @v_shuffle_v3i32_v2i32__3_u_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -836,16 +828,15 @@ define void @v_shuffle_v3i32_v2i32__3_1_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v2i32__3_1_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ; def v[3:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v5, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -891,16 +882,15 @@ define void @v_shuffle_v3i32_v2i32__3_2_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v2i32__3_2_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[3:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v5, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1033,13 +1023,13 @@ define void @v_shuffle_v3i32_v2i32__1_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v2i32__1_1_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 ; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1121,16 +1111,15 @@ define void @v_shuffle_v3i32_v2i32__3_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v2i32__3_1_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ; def v[1:2] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, v2 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1176,15 +1165,14 @@ define void @v_shuffle_v3i32_v2i32__3_u_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v2i32__3_u_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1230,15 +1218,14 @@ define void @v_shuffle_v3i32_v2i32__3_0_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v2i32__3_0_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:4] +; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v4 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1289,13 +1276,12 @@ define void @v_shuffle_v3i32_v2i32__3_2_1(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1393,12 +1379,11 @@ define void @v_shuffle_v3i32_v2i32__1_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v2i32__1_2_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx3 v2, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v2, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1446,13 +1431,13 @@ define void @v_shuffle_v3i32_v2i32__3_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v2i32__3_2_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1493,13 +1478,12 @@ define void @v_shuffle_v3i32_v2i32__3_u_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v2i32__3_u_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx3 v2, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1539,30 +1523,29 @@ define void @v_shuffle_v3i32_v2i32__3_0_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i32_v2i32__3_0_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:3] -; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v5 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 ; GFX90A-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -1570,16 +1553,16 @@ define void @v_shuffle_v3i32_v2i32__3_0_2(ptr addrspace(1) inreg %ptr) { ; GFX942-LABEL: v_shuffle_v3i32_v2i32__3_0_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:3] -; GFX942-NEXT: ;;#ASMEND ; 
GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: v_mov_b32_e32 v0, v5 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[2:3] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v1, v4 ; GFX942-NEXT: global_store_dwordx3 v6, v[0:2], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] @@ -1595,14 +1578,14 @@ define void @v_shuffle_v3i32_v2i32__3_1_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ; def v[1:2] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1746,15 +1729,14 @@ define void @v_shuffle_v3i32_v2i32__1_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1844,12 +1826,12 @@ define void @v_shuffle_v3i32_v2i32__3_u_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v2i32__3_u_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx3 v2, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1890,15 +1872,14 @@ define void @v_shuffle_v3i32_v2i32__3_0_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1947,15 +1928,14 @@ define void @v_shuffle_v3i32_v2i32__3_1_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: 
;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ; def v[1:2] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2001,12 +1981,13 @@ define void @v_shuffle_v3i32_v2i32__3_2_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v2i32__3_2_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2370,7 +2351,29 @@ define void @s_shuffle_v3i32_v2i32__3_3_u() { } define void @s_shuffle_v3i32_v2i32__3_3_0() { -; GFX900-LABEL: s_shuffle_v3i32_v2i32__3_3_0: +; GFX9-LABEL: s_shuffle_v3i32_v2i32__3_3_0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[8:9] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s9 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:10] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=s"() + %vec1 = call <2 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <3 x i32> <i32 3, i32 3, i32 0> + call void asm sideeffect "; use $0", "{s[8:10]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v2i32__3_3_1() { +; GFX900-LABEL: s_shuffle_v3i32_v2i32__3_3_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART @@ -2380,13 +2383,13 @@ define void @s_shuffle_v3i32_v2i32__3_3_0() { ; GFX900-NEXT: ; def s[4:5] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s8, s9 -; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s10, s5 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:10] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v3i32_v2i32__3_3_0: +; GFX90A-LABEL: s_shuffle_v3i32_v2i32__3_3_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART @@ -2396,13 +2399,13 @@ define void @s_shuffle_v3i32_v2i32__3_3_0() { ; GFX90A-NEXT: ; def s[4:5] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_mov_b32 s8, s9 -; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s10, s5 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:10] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v3i32_v2i32__3_3_0: +; GFX942-LABEL: s_shuffle_v3i32_v2i32__3_3_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART @@ -2412,115 +2415,31 @@ define void @s_shuffle_v3i32_v2i32__3_3_0() { ; GFX942-NEXT: ; def s[0:1] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_mov_b32 s8, s9 -; GFX942-NEXT: s_mov_b32 
s10, s0 +; GFX942-NEXT: s_mov_b32 s10, s1 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:10] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i32> asm "; def $0", "=s"() %vec1 = call <2 x i32> asm "; def $0", "=s"() - %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <3 x i32> <i32 3, i32 3, i32 0> - call void asm sideeffect "; use $0", "{s[8:10]}"(<3 x i32> %shuf) - ret void -} - -define void @s_shuffle_v3i32_v2i32__3_3_1() { -; GFX900-LABEL: s_shuffle_v3i32_v2i32__3_3_1: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:9] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:5] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s9 -; GFX900-NEXT: s_mov_b32 s10, s5 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:10] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v3i32_v2i32__3_3_1: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:9] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:5] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s9 -; GFX90A-NEXT: s_mov_b32 s10, s5 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:10] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v3i32_v2i32__3_3_1: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:9] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:1] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s9 -; GFX942-NEXT: s_mov_b32 s10, s1 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:10] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <2 x i32> asm "; def $0", "=s"() - %vec1 = call <2 x i32> asm "; def $0", "=s"() - %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <3 x i32> <i32 3, i32 3, i32 1> + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <3 x i32> <i32 3, i32 3, i32 1> call void asm sideeffect "; use $0", "{s[8:10]}"(<3 x i32> %shuf) ret void } define void @s_shuffle_v3i32_v2i32__3_3_2() { -; GFX900-LABEL: s_shuffle_v3i32_v2i32__3_3_2: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:5] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s5 -; GFX900-NEXT: s_mov_b32 s9, s5 -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:10] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v3i32_v2i32__3_3_2: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:5] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s5 -; GFX90A-NEXT: s_mov_b32 s9, s5 -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:10] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v3i32_v2i32__3_3_2: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:1] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s1 -; GFX942-NEXT: s_mov_b32 s9, s1 -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:10] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: 
s_shuffle_v3i32_v2i32__3_3_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s11 +; GFX9-NEXT: s_mov_b32 s9, s11 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:10] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i32> asm "; def $0", "=s"() %vec1 = call <2 x i32> asm "; def $0", "=s"() %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <3 x i32> <i32 3, i32 3, i32 2> @@ -2549,44 +2468,17 @@ define void @s_shuffle_v3i32_v2i32__3_3_3() { } define void @s_shuffle_v3i32_v2i32__u_0_0() { -; GFX900-LABEL: s_shuffle_v3i32_v2i32__u_0_0: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:5] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s9, s4 -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:10] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v3i32_v2i32__u_0_0: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:5] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s9, s4 -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:10] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v3i32_v2i32__u_0_0: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:1] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s9, s0 -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:10] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: s_shuffle_v3i32_v2i32__u_0_0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s9, s10 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:10] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i32> asm "; def $0", "=s"() %shuf = shufflevector <2 x i32> %vec0, <2 x i32> poison, <3 x i32> <i32 poison, i32 0, i32 0> call void asm sideeffect "; use $0", "{s[8:10]}"(<3 x i32> %shuf) @@ -2613,47 +2505,18 @@ } define void @s_shuffle_v3i32_v2i32__0_0_0() { } define void @s_shuffle_v3i32_v2i32__1_0_0() { -; GFX900-LABEL: s_shuffle_v3i32_v2i32__1_0_0: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:5] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s5 -; GFX900-NEXT: s_mov_b32 s9, s4 -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:10] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v3i32_v2i32__1_0_0: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:5] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s5 -; GFX90A-NEXT: s_mov_b32 s9, s4 -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:10] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v3i32_v2i32__1_0_0: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:1] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 
s8, s1 -; GFX942-NEXT: s_mov_b32 s9, s0 -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:10] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: s_shuffle_v3i32_v2i32__1_0_0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s11 +; GFX9-NEXT: s_mov_b32 s9, s10 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:10] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i32> asm "; def $0", "=s"() %shuf = shufflevector <2 x i32> %vec0, <2 x i32> poison, <3 x i32> <i32 1, i32 0, i32 0> call void asm sideeffect "; use $0", "{s[8:10]}"(<3 x i32> %shuf) @@ -2661,44 +2524,17 @@ define void @s_shuffle_v3i32_v2i32__1_0_0() { } define void @s_shuffle_v3i32_v2i32__2_0_0() { -; GFX900-LABEL: s_shuffle_v3i32_v2i32__2_0_0: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:5] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s9, s4 -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:10] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v3i32_v2i32__2_0_0: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:5] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s9, s4 -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:10] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v3i32_v2i32__2_0_0: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:1] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s9, s0 -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:10] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: s_shuffle_v3i32_v2i32__2_0_0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s9, s10 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:10] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i32> asm "; def $0", "=s"() %shuf = shufflevector <2 x i32> %vec0, <2 x i32> poison, <3 x i32> <i32 2, i32 0, i32 0> call void asm sideeffect "; use $0", "{s[8:10]}"(<3 x i32> %shuf) ret void @@ -2710,14 +2546,13 @@ define void @s_shuffle_v3i32_v2i32__3_0_0() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ; def s[10:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ; def s[4:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s7 -; GFX900-NEXT: s_mov_b32 s9, s4 -; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s8, s5 +; GFX900-NEXT: s_mov_b32 s9, s10 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:10] ; GFX900-NEXT: ;;#ASMEND @@ -2727,14 +2562,13 @@ define void @s_shuffle_v3i32_v2i32__3_0_0() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ; def s[10:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ; def s[4:5] ; GFX90A-NEXT: ;;#ASMEND 
-; GFX90A-NEXT: s_mov_b32 s8, s7 -; GFX90A-NEXT: s_mov_b32 s9, s4 -; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s8, s5 +; GFX90A-NEXT: s_mov_b32 s9, s10 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:10] ; GFX90A-NEXT: ;;#ASMEND @@ -2744,14 +2578,13 @@ define void @s_shuffle_v3i32_v2i32__3_0_0() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:1] +; GFX942-NEXT: ; def s[10:11] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[2:3] +; GFX942-NEXT: ; def s[0:1] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s3 -; GFX942-NEXT: s_mov_b32 s9, s0 -; GFX942-NEXT: s_mov_b32 s10, s0 +; GFX942-NEXT: s_mov_b32 s8, s1 +; GFX942-NEXT: s_mov_b32 s9, s10 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:10] ; GFX942-NEXT: ;;#ASMEND @@ -2768,13 +2601,12 @@ define void @s_shuffle_v3i32_v2i32__3_u_0() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ; def s[10:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ; def s[4:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s7 -; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s8, s5 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:10] ; GFX900-NEXT: ;;#ASMEND @@ -2784,13 +2616,12 @@ define void @s_shuffle_v3i32_v2i32__3_u_0() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ; def s[10:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ; def s[4:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s7 -; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s8, s5 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:10] ; GFX90A-NEXT: ;;#ASMEND @@ -2800,13 +2631,12 @@ define void @s_shuffle_v3i32_v2i32__3_u_0() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:1] +; GFX942-NEXT: ; def s[10:11] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[2:3] +; GFX942-NEXT: ; def s[0:1] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s3 -; GFX942-NEXT: s_mov_b32 s10, s0 +; GFX942-NEXT: s_mov_b32 s8, s1 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:10] ; GFX942-NEXT: ;;#ASMEND @@ -2823,14 +2653,13 @@ define void @s_shuffle_v3i32_v2i32__3_1_0() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ; def s[10:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ; def s[4:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s7 -; GFX900-NEXT: s_mov_b32 s9, s5 -; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s8, s5 +; GFX900-NEXT: s_mov_b32 s9, s11 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:10] ; GFX900-NEXT: ;;#ASMEND @@ -2840,14 +2669,13 @@ define void @s_shuffle_v3i32_v2i32__3_1_0() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ; def s[10:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ; def s[4:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s7 -; GFX90A-NEXT: s_mov_b32 s9, s5 -; 
GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s8, s5 +; GFX90A-NEXT: s_mov_b32 s9, s11 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:10] ; GFX90A-NEXT: ;;#ASMEND @@ -2857,14 +2685,13 @@ define void @s_shuffle_v3i32_v2i32__3_1_0() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:1] +; GFX942-NEXT: ; def s[10:11] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[2:3] +; GFX942-NEXT: ; def s[0:1] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s3 -; GFX942-NEXT: s_mov_b32 s9, s1 -; GFX942-NEXT: s_mov_b32 s10, s0 +; GFX942-NEXT: s_mov_b32 s8, s1 +; GFX942-NEXT: s_mov_b32 s9, s11 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:10] ; GFX942-NEXT: ;;#ASMEND @@ -2881,14 +2708,13 @@ define void @s_shuffle_v3i32_v2i32__3_2_0() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ; def s[10:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ; def s[4:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s7 -; GFX900-NEXT: s_mov_b32 s9, s6 -; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s8, s5 +; GFX900-NEXT: s_mov_b32 s9, s4 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:10] ; GFX900-NEXT: ;;#ASMEND @@ -2898,14 +2724,13 @@ define void @s_shuffle_v3i32_v2i32__3_2_0() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ; def s[10:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ; def s[4:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s7 -; GFX90A-NEXT: s_mov_b32 s9, s6 -; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s8, s5 +; GFX90A-NEXT: s_mov_b32 s9, s4 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:10] ; GFX90A-NEXT: ;;#ASMEND @@ -2915,14 +2740,13 @@ define void @s_shuffle_v3i32_v2i32__3_2_0() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:1] +; GFX942-NEXT: ; def s[10:11] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[2:3] +; GFX942-NEXT: ; def s[0:1] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s3 -; GFX942-NEXT: s_mov_b32 s9, s2 -; GFX942-NEXT: s_mov_b32 s10, s0 +; GFX942-NEXT: s_mov_b32 s8, s1 +; GFX942-NEXT: s_mov_b32 s9, s0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:10] ; GFX942-NEXT: ;;#ASMEND @@ -3344,47 +3168,18 @@ define void @s_shuffle_v3i32_v2i32__2_2_2() { } define void @s_shuffle_v3i32_v2i32__3_2_2() { -; GFX900-LABEL: s_shuffle_v3i32_v2i32__3_2_2: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:5] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s5 -; GFX900-NEXT: s_mov_b32 s9, s4 -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:10] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v3i32_v2i32__3_2_2: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:5] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s5 -; GFX90A-NEXT: s_mov_b32 s9, s4 -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:10] -; 
GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v3i32_v2i32__3_2_2: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:1] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s1 -; GFX942-NEXT: s_mov_b32 s9, s0 -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:10] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: s_shuffle_v3i32_v2i32__3_2_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s11 +; GFX9-NEXT: s_mov_b32 s9, s10 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:10] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i32> asm "; def $0", "=s"() %vec1 = call <2 x i32> asm "; def $0", "=s"() %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <3 x i32> <i32 3, i32 2, i32 2> @@ -3393,44 +3188,17 @@ define void @s_shuffle_v3i32_v2i32__3_2_2() { } define void @s_shuffle_v3i32_v2i32__3_u_2() { -; GFX900-LABEL: s_shuffle_v3i32_v2i32__3_u_2: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:5] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s5 -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:10] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v3i32_v2i32__3_u_2: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:5] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s5 -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:10] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v3i32_v2i32__3_u_2: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:1] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s1 -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:10] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: s_shuffle_v3i32_v2i32__3_u_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s11 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:10] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i32> asm "; def $0", "=s"() %vec1 = call <2 x i32> asm "; def $0", "=s"() %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <3 x i32> <i32 3, i32 poison, i32 2> @@ -3446,11 +3214,10 @@ define void @s_shuffle_v3i32_v2i32__3_0_2() { ; GFX900-NEXT: ; def s[4:5] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ; def s[10:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s7 +; GFX900-NEXT: s_mov_b32 s8, s11 ; GFX900-NEXT: s_mov_b32 s9, s4 -; GFX900-NEXT: s_mov_b32 s10, s6 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:10] ; GFX900-NEXT: ;;#ASMEND @@ -3463,11 +3230,10 @@ define void @s_shuffle_v3i32_v2i32__3_0_2() { ; GFX90A-NEXT: ; def s[4:5] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ; def s[10:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s7 +; 
GFX90A-NEXT: s_mov_b32 s8, s11 ; GFX90A-NEXT: s_mov_b32 s9, s4 -; GFX90A-NEXT: s_mov_b32 s10, s6 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:10] ; GFX90A-NEXT: ;;#ASMEND @@ -3480,11 +3246,10 @@ define void @s_shuffle_v3i32_v2i32__3_0_2() { ; GFX942-NEXT: ; def s[0:1] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[2:3] +; GFX942-NEXT: ; def s[10:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s3 +; GFX942-NEXT: s_mov_b32 s8, s11 ; GFX942-NEXT: s_mov_b32 s9, s0 -; GFX942-NEXT: s_mov_b32 s10, s2 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:10] ; GFX942-NEXT: ;;#ASMEND @@ -3497,53 +3262,20 @@ define void @s_shuffle_v3i32_v2i32__3_0_2() { } define void @s_shuffle_v3i32_v2i32__3_1_2() { -; GFX900-LABEL: s_shuffle_v3i32_v2i32__3_1_2: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:9] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:5] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s5 -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:10] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v3i32_v2i32__3_1_2: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:9] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:5] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s5 -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:10] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v3i32_v2i32__3_1_2: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:9] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:1] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s1 -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:10] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: s_shuffle_v3i32_v2i32__3_1_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[8:9] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s11 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:10] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i32> asm "; def $0", "=s"() %vec1 = call <2 x i32> asm "; def $0", "=s"() %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <3 x i32> <i32 3, i32 1, i32 2> diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3i32.v3i32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3i32.v3i32.ll index 7061c13b28d03..89e6a2918a68c 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v3i32.v3i32.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v3i32.v3i32.ll @@ -58,12 +58,11 @@ define void @v_shuffle_v3i32_v3i32__1_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v3i32__1_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v3, v[1:3], 
s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -153,12 +152,11 @@ define void @v_shuffle_v3i32_v3i32__4_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v3i32__4_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v3, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -240,15 +238,14 @@ define void @v_shuffle_v3i32_v3i32__5_0_u(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -355,9 +352,8 @@ define void @v_shuffle_v3i32_v3i32__5_2_u(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: global_store_dwordx3 v6, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -403,13 +399,12 @@ define void @v_shuffle_v3i32_v3i32__5_3_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v3i32__5_3_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx3 v3, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -495,9 +490,8 @@ define void @v_shuffle_v3i32_v3i32__5_5_u(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 ; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v3, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -540,14 +534,12 @@ define void @v_shuffle_v3i32_v3i32__5_5_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; 
GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v6, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -597,16 +589,14 @@ define void @v_shuffle_v3i32_v3i32__5_5_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v4 -; GFX900-NEXT: v_mov_b32_e32 v4, v1 -; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: global_store_dwordx3 v5, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -656,15 +646,14 @@ define void @v_shuffle_v3i32_v3i32__5_5_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v1, v5 -; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -760,14 +749,13 @@ define void @v_shuffle_v3i32_v3i32__5_5_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v3i32__5_5_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: v_mov_b32_e32 v4, v2 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -813,10 +801,10 @@ define void @v_shuffle_v3i32_v3i32__5_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -857,12 +845,11 @@ define void @v_shuffle_v3i32_v3i32__u_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v0 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; 
GFX900-NEXT: global_store_dwordx3 v0, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -994,13 +981,12 @@ define void @v_shuffle_v3i32_v3i32__2_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1041,12 +1027,11 @@ define void @v_shuffle_v3i32_v3i32__3_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v0 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v0, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1088,14 +1073,12 @@ define void @v_shuffle_v3i32_v3i32__4_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx3 v6, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1145,16 +1128,15 @@ define void @v_shuffle_v3i32_v3i32__5_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1376,16 +1358,15 @@ define void @v_shuffle_v3i32_v3i32__5_3_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: 
v_mov_b32_e32 v4, v0 -; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: global_store_dwordx3 v7, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1574,10 +1555,10 @@ define void @v_shuffle_v3i32_v3i32__1_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1704,15 +1685,14 @@ define void @v_shuffle_v3i32_v3i32__4_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx3 v5, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1816,15 +1796,14 @@ define void @v_shuffle_v3i32_v3i32__5_u_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, v5 +; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1872,16 +1851,14 @@ define void @v_shuffle_v3i32_v3i32__5_0_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: v_mov_b32_e32 v4, v1 -; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, v5 +; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1931,16 +1908,15 @@ define void @v_shuffle_v3i32_v3i32__5_2_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[4:6] 
; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: v_mov_b32_e32 v4, v2 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: global_store_dwordx3 v7, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1989,16 +1965,15 @@ define void @v_shuffle_v3i32_v3i32__5_3_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v3, v4 -; GFX900-NEXT: v_mov_b32_e32 v4, v2 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2100,12 +2075,12 @@ define void @v_shuffle_v3i32_v3i32__u_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v3i32__u_2_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2184,13 +2159,12 @@ define void @v_shuffle_v3i32_v3i32__1_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v3i32__1_2_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2232,10 +2206,10 @@ define void @v_shuffle_v3i32_v3i32__2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2274,12 +2248,12 @@ define void @v_shuffle_v3i32_v3i32__3_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v3i32__3_2_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; 
GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2317,15 +2291,14 @@ define void @v_shuffle_v3i32_v3i32__4_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2374,13 +2347,13 @@ define void @v_shuffle_v3i32_v3i32__5_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx3 v6, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2480,16 +2453,15 @@ define void @v_shuffle_v3i32_v3i32__5_0_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v2 -; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx3 v7, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2591,15 +2563,15 @@ define void @v_shuffle_v3i32_v3i32__5_3_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2751,12 +2723,11 @@ define void @v_shuffle_v3i32_v3i32__1_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v3i32__1_3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 
v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v3, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2895,13 +2866,12 @@ define void @v_shuffle_v3i32_v3i32__5_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2988,16 +2958,14 @@ define void @v_shuffle_v3i32_v3i32__5_0_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: v_mov_b32_e32 v4, v1 -; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3103,10 +3071,8 @@ define void @v_shuffle_v3i32_v3i32__5_2_3(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: global_store_dwordx3 v6, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3306,9 +3272,8 @@ define void @v_shuffle_v3i32_v3i32__1_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: v_mov_b32_e32 v4, v3 -; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx3 v5, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3456,10 +3421,10 @@ define void @v_shuffle_v3i32_v3i32__4_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3545,13 +3510,12 @@ define void @v_shuffle_v3i32_v3i32__5_u_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v3i32__5_u_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def 
v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3592,15 +3556,15 @@ define void @v_shuffle_v3i32_v3i32__5_0_4(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v2 -; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3705,14 +3669,13 @@ define void @v_shuffle_v3i32_v3i32__5_2_4(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: global_store_dwordx3 v6, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3760,13 +3723,12 @@ define void @v_shuffle_v3i32_v3i32__5_3_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v3i32__5_3_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: v_mov_b32_e32 v4, v1 -; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3807,12 +3769,12 @@ define void @v_shuffle_v3i32_v3i32__u_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v3i32__u_5_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3913,9 +3875,9 @@ define void @v_shuffle_v3i32_v3i32__1_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 ; GFX900-NEXT: v_mov_b32_e32 v3, v4 -; GFX900-NEXT: 
global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v5, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3963,15 +3925,15 @@ define void @v_shuffle_v3i32_v3i32__2_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 ; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: v_mov_b32_e32 v4, v5 -; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4060,13 +4022,12 @@ define void @v_shuffle_v3i32_v3i32__4_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v3i32__4_5_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4150,15 +4111,15 @@ define void @v_shuffle_v3i32_v3i32__5_0_5(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4262,15 +4223,15 @@ define void @v_shuffle_v3i32_v3i32__5_2_5(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v5 ; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: v_mov_b32_e32 v4, v2 -; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v6, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4316,14 +4277,13 @@ define void @v_shuffle_v3i32_v3i32__5_3_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v3i32__5_3_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: 
v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v2 -; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3p3.v2p3.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3p3.v2p3.ll index bd0100a4ffdb5..25e087bd922ac 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v3p3.v2p3.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v3p3.v2p3.ll @@ -58,12 +58,11 @@ define void @v_shuffle_v3p3_v2p3__1_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v2p3__1_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx3 v2, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v2, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -111,12 +110,11 @@ define void @v_shuffle_v3p3_v2p3__3_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v2p3__3_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx3 v2, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v2, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -154,15 +152,14 @@ define void @v_shuffle_v3p3_v2p3__3_0_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v2p3__3_0_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] -; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -208,15 +205,14 @@ define void @v_shuffle_v3p3_v2p3__3_1_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v2p3__3_1_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ; def v[1:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v3, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -263,10 +259,10 @@ define void @v_shuffle_v3p3_v2p3__3_2_u(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; 
GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: global_store_dwordx3 v3, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -307,12 +303,12 @@ define void @v_shuffle_v3p3_v2p3__3_3_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v2p3__3_3_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx3 v2, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v3, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -350,15 +346,15 @@ define void @v_shuffle_v3p3_v2p3__3_3_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v2p3__3_3_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ; def v[3:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v5, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -403,14 +399,14 @@ define void @v_shuffle_v3p3_v2p3__3_3_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -458,11 +454,11 @@ define void @v_shuffle_v3p3_v2p3__3_3_2(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -503,13 +499,13 @@ define void @v_shuffle_v3p3_v2p3__3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v2p3__3_3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 ; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt 
vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -561,26 +557,25 @@ define void @v_shuffle_v3p3_v2p3__u_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3p3_v2p3__u_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx3 v0, v[0:2], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p3_v2p3__u_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:1] +; GFX942-NEXT: ; def v[2:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v1, v0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v1, v2 +; GFX942-NEXT: global_store_dwordx3 v0, v[0:2], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -638,13 +633,13 @@ define void @v_shuffle_v3p3_v2p3__1_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v2p3__1_0_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -696,26 +691,25 @@ define void @v_shuffle_v3p3_v2p3__2_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3p3_v2p3__2_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx3 v0, v[0:2], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p3_v2p3__2_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:1] +; GFX942-NEXT: ; def v[2:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v1, v0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v1, v2 +; GFX942-NEXT: global_store_dwordx3 v0, v[0:2], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -729,15 +723,14 @@ define void 
@v_shuffle_v3p3_v2p3__3_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -784,15 +777,14 @@ define void @v_shuffle_v3p3_v2p3__3_u_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -836,16 +828,15 @@ define void @v_shuffle_v3p3_v2p3__3_1_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v2p3__3_1_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ; def v[3:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v5, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -891,16 +882,15 @@ define void @v_shuffle_v3p3_v2p3__3_2_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v2p3__3_2_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[3:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v5, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1033,13 +1023,13 @@ define void @v_shuffle_v3p3_v2p3__1_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v2p3__1_1_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 ; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: 
v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1121,16 +1111,15 @@ define void @v_shuffle_v3p3_v2p3__3_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v2p3__3_1_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ; def v[1:2] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, v2 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1176,15 +1165,14 @@ define void @v_shuffle_v3p3_v2p3__3_u_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v2p3__3_u_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1230,15 +1218,14 @@ define void @v_shuffle_v3p3_v2p3__3_0_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v2p3__3_0_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:4] +; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v4 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1289,13 +1276,12 @@ define void @v_shuffle_v3p3_v2p3__3_2_1(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1393,12 +1379,11 @@ define void @v_shuffle_v3p3_v2p3__1_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v2p3__1_2_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx3 v2, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v2, v[1:3], 
s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1446,13 +1431,13 @@ define void @v_shuffle_v3p3_v2p3__3_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v2p3__3_2_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1493,13 +1478,12 @@ define void @v_shuffle_v3p3_v2p3__3_u_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v2p3__3_u_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx3 v2, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1539,30 +1523,29 @@ define void @v_shuffle_v3p3_v2p3__3_0_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p3_v2p3__3_0_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:3] -; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v5 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 ; GFX90A-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -1570,16 +1553,16 @@ define void @v_shuffle_v3p3_v2p3__3_0_2(ptr addrspace(1) inreg %ptr) { ; GFX942-LABEL: v_shuffle_v3p3_v2p3__3_0_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:3] -; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: v_mov_b32_e32 v0, v5 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[2:3] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: 
v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v1, v4 ; GFX942-NEXT: global_store_dwordx3 v6, v[0:2], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] @@ -1595,14 +1578,14 @@ define void @v_shuffle_v3p3_v2p3__3_1_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ; def v[1:2] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1746,15 +1729,14 @@ define void @v_shuffle_v3p3_v2p3__1_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1844,12 +1826,12 @@ define void @v_shuffle_v3p3_v2p3__3_u_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v2p3__3_u_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx3 v2, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1890,15 +1872,14 @@ define void @v_shuffle_v3p3_v2p3__3_0_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1947,15 +1928,14 @@ define void @v_shuffle_v3p3_v2p3__3_1_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ; def v[1:2] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; 
GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2001,12 +1981,13 @@ define void @v_shuffle_v3p3_v2p3__3_2_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v2p3__3_2_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2370,7 +2351,29 @@ define void @s_shuffle_v3p3_v2p3__3_3_u() { } define void @s_shuffle_v3p3_v2p3__3_3_0() { -; GFX900-LABEL: s_shuffle_v3p3_v2p3__3_3_0: +; GFX9-LABEL: s_shuffle_v3p3_v2p3__3_3_0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[8:9] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s9 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:10] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[8:10]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v2p3__3_3_1() { +; GFX900-LABEL: s_shuffle_v3p3_v2p3__3_3_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART @@ -2380,13 +2383,13 @@ define void @s_shuffle_v3p3_v2p3__3_3_0() { ; GFX900-NEXT: ; def s[4:5] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s8, s9 -; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s10, s5 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:10] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v3p3_v2p3__3_3_0: +; GFX90A-LABEL: s_shuffle_v3p3_v2p3__3_3_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART @@ -2396,13 +2399,13 @@ define void @s_shuffle_v3p3_v2p3__3_3_0() { ; GFX90A-NEXT: ; def s[4:5] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_mov_b32 s8, s9 -; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s10, s5 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:10] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v3p3_v2p3__3_3_0: +; GFX942-LABEL: s_shuffle_v3p3_v2p3__3_3_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART @@ -2412,115 +2415,31 @@ define void @s_shuffle_v3p3_v2p3__3_3_0() { ; GFX942-NEXT: ; def s[0:1] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_mov_b32 s8, s9 -; GFX942-NEXT: s_mov_b32 s10, s0 +; GFX942-NEXT: s_mov_b32 s10, s1 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:10] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() - %shuf = shufflevector <2 x ptr 
addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <3 x i32> - call void asm sideeffect "; use $0", "{s[8:10]}"(<3 x ptr addrspace(3)> %shuf) - ret void -} - -define void @s_shuffle_v3p3_v2p3__3_3_1() { -; GFX900-LABEL: s_shuffle_v3p3_v2p3__3_3_1: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:9] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:5] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s9 -; GFX900-NEXT: s_mov_b32 s10, s5 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:10] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v3p3_v2p3__3_3_1: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:9] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:5] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s9 -; GFX90A-NEXT: s_mov_b32 s10, s5 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:10] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v3p3_v2p3__3_3_1: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:9] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:1] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s9 -; GFX942-NEXT: s_mov_b32 s10, s1 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:10] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() - %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() - %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <3 x i32> + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <3 x i32> call void asm sideeffect "; use $0", "{s[8:10]}"(<3 x ptr addrspace(3)> %shuf) ret void } define void @s_shuffle_v3p3_v2p3__3_3_2() { -; GFX900-LABEL: s_shuffle_v3p3_v2p3__3_3_2: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:5] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s5 -; GFX900-NEXT: s_mov_b32 s9, s5 -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:10] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v3p3_v2p3__3_3_2: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:5] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s5 -; GFX90A-NEXT: s_mov_b32 s9, s5 -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:10] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v3p3_v2p3__3_3_2: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:1] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s1 -; GFX942-NEXT: s_mov_b32 s9, s1 -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:10] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: s_shuffle_v3p3_v2p3__3_3_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[10:11] +; GFX9-NEXT: ;;#ASMEND 
+; GFX9-NEXT: s_mov_b32 s8, s11 +; GFX9-NEXT: s_mov_b32 s9, s11 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:10] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <3 x i32> @@ -2549,44 +2468,17 @@ define void @s_shuffle_v3p3_v2p3__3_3_3() { } define void @s_shuffle_v3p3_v2p3__u_0_0() { -; GFX900-LABEL: s_shuffle_v3p3_v2p3__u_0_0: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:5] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s9, s4 -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:10] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v3p3_v2p3__u_0_0: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:5] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s9, s4 -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:10] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v3p3_v2p3__u_0_0: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:1] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s9, s0 -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:10] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: s_shuffle_v3p3_v2p3__u_0_0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s9, s10 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:10] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> poison, <3 x i32> call void asm sideeffect "; use $0", "{s[8:10]}"(<3 x ptr addrspace(3)> %shuf) @@ -2613,47 +2505,18 @@ define void @s_shuffle_v3p3_v2p3__0_0_0() { } define void @s_shuffle_v3p3_v2p3__1_0_0() { -; GFX900-LABEL: s_shuffle_v3p3_v2p3__1_0_0: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:5] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s5 -; GFX900-NEXT: s_mov_b32 s9, s4 -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:10] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v3p3_v2p3__1_0_0: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:5] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s5 -; GFX90A-NEXT: s_mov_b32 s9, s4 -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:10] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v3p3_v2p3__1_0_0: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:1] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s1 -; GFX942-NEXT: s_mov_b32 s9, s0 -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: ;;#ASMSTART 
-; GFX942-NEXT: ; use s[8:10] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: s_shuffle_v3p3_v2p3__1_0_0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s11 +; GFX9-NEXT: s_mov_b32 s9, s10 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:10] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> poison, <3 x i32> call void asm sideeffect "; use $0", "{s[8:10]}"(<3 x ptr addrspace(3)> %shuf) @@ -2661,44 +2524,17 @@ define void @s_shuffle_v3p3_v2p3__1_0_0() { } define void @s_shuffle_v3p3_v2p3__2_0_0() { -; GFX900-LABEL: s_shuffle_v3p3_v2p3__2_0_0: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:5] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s9, s4 -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:10] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v3p3_v2p3__2_0_0: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:5] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s9, s4 -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:10] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v3p3_v2p3__2_0_0: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:1] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s9, s0 -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:10] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: s_shuffle_v3p3_v2p3__2_0_0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s9, s10 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:10] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> poison, <3 x i32> call void asm sideeffect "; use $0", "{s[8:10]}"(<3 x ptr addrspace(3)> %shuf) @@ -2710,14 +2546,13 @@ define void @s_shuffle_v3p3_v2p3__3_0_0() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ; def s[10:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ; def s[4:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s7 -; GFX900-NEXT: s_mov_b32 s9, s4 -; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s8, s5 +; GFX900-NEXT: s_mov_b32 s9, s10 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:10] ; GFX900-NEXT: ;;#ASMEND @@ -2727,14 +2562,13 @@ define void @s_shuffle_v3p3_v2p3__3_0_0() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ; def s[10:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ; def s[4:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: 
s_mov_b32 s8, s7 -; GFX90A-NEXT: s_mov_b32 s9, s4 -; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s8, s5 +; GFX90A-NEXT: s_mov_b32 s9, s10 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:10] ; GFX90A-NEXT: ;;#ASMEND @@ -2744,14 +2578,13 @@ define void @s_shuffle_v3p3_v2p3__3_0_0() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:1] +; GFX942-NEXT: ; def s[10:11] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[2:3] +; GFX942-NEXT: ; def s[0:1] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s3 -; GFX942-NEXT: s_mov_b32 s9, s0 -; GFX942-NEXT: s_mov_b32 s10, s0 +; GFX942-NEXT: s_mov_b32 s8, s1 +; GFX942-NEXT: s_mov_b32 s9, s10 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:10] ; GFX942-NEXT: ;;#ASMEND @@ -2768,13 +2601,12 @@ define void @s_shuffle_v3p3_v2p3__3_u_0() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ; def s[10:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ; def s[4:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s7 -; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s8, s5 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:10] ; GFX900-NEXT: ;;#ASMEND @@ -2784,13 +2616,12 @@ define void @s_shuffle_v3p3_v2p3__3_u_0() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ; def s[10:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ; def s[4:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s7 -; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s8, s5 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:10] ; GFX90A-NEXT: ;;#ASMEND @@ -2800,13 +2631,12 @@ define void @s_shuffle_v3p3_v2p3__3_u_0() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:1] +; GFX942-NEXT: ; def s[10:11] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[2:3] +; GFX942-NEXT: ; def s[0:1] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s3 -; GFX942-NEXT: s_mov_b32 s10, s0 +; GFX942-NEXT: s_mov_b32 s8, s1 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:10] ; GFX942-NEXT: ;;#ASMEND @@ -2823,14 +2653,13 @@ define void @s_shuffle_v3p3_v2p3__3_1_0() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ; def s[10:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ; def s[4:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s7 -; GFX900-NEXT: s_mov_b32 s9, s5 -; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s8, s5 +; GFX900-NEXT: s_mov_b32 s9, s11 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:10] ; GFX900-NEXT: ;;#ASMEND @@ -2840,14 +2669,13 @@ define void @s_shuffle_v3p3_v2p3__3_1_0() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ; def s[10:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ; def s[4:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s7 -; GFX90A-NEXT: s_mov_b32 s9, s5 -; GFX90A-NEXT: s_mov_b32 s10, s4 
+; GFX90A-NEXT: s_mov_b32 s8, s5 +; GFX90A-NEXT: s_mov_b32 s9, s11 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:10] ; GFX90A-NEXT: ;;#ASMEND @@ -2857,14 +2685,13 @@ define void @s_shuffle_v3p3_v2p3__3_1_0() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:1] +; GFX942-NEXT: ; def s[10:11] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[2:3] +; GFX942-NEXT: ; def s[0:1] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s3 -; GFX942-NEXT: s_mov_b32 s9, s1 -; GFX942-NEXT: s_mov_b32 s10, s0 +; GFX942-NEXT: s_mov_b32 s8, s1 +; GFX942-NEXT: s_mov_b32 s9, s11 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:10] ; GFX942-NEXT: ;;#ASMEND @@ -2881,14 +2708,13 @@ define void @s_shuffle_v3p3_v2p3__3_2_0() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ; def s[10:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ; def s[4:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s7 -; GFX900-NEXT: s_mov_b32 s9, s6 -; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s8, s5 +; GFX900-NEXT: s_mov_b32 s9, s4 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:10] ; GFX900-NEXT: ;;#ASMEND @@ -2898,14 +2724,13 @@ define void @s_shuffle_v3p3_v2p3__3_2_0() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ; def s[10:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ; def s[4:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s7 -; GFX90A-NEXT: s_mov_b32 s9, s6 -; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s8, s5 +; GFX90A-NEXT: s_mov_b32 s9, s4 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:10] ; GFX90A-NEXT: ;;#ASMEND @@ -2915,14 +2740,13 @@ define void @s_shuffle_v3p3_v2p3__3_2_0() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:1] +; GFX942-NEXT: ; def s[10:11] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[2:3] +; GFX942-NEXT: ; def s[0:1] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s3 -; GFX942-NEXT: s_mov_b32 s9, s2 -; GFX942-NEXT: s_mov_b32 s10, s0 +; GFX942-NEXT: s_mov_b32 s8, s1 +; GFX942-NEXT: s_mov_b32 s9, s0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:10] ; GFX942-NEXT: ;;#ASMEND @@ -3344,47 +3168,18 @@ define void @s_shuffle_v3p3_v2p3__2_2_2() { } define void @s_shuffle_v3p3_v2p3__3_2_2() { -; GFX900-LABEL: s_shuffle_v3p3_v2p3__3_2_2: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:5] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s5 -; GFX900-NEXT: s_mov_b32 s9, s4 -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:10] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v3p3_v2p3__3_2_2: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:5] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s5 -; GFX90A-NEXT: s_mov_b32 s9, s4 -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:10] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 
s[30:31] -; -; GFX942-LABEL: s_shuffle_v3p3_v2p3__3_2_2: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:1] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s1 -; GFX942-NEXT: s_mov_b32 s9, s0 -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:10] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: s_shuffle_v3p3_v2p3__3_2_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s11 +; GFX9-NEXT: s_mov_b32 s9, s10 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:10] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <3 x i32> @@ -3393,44 +3188,17 @@ define void @s_shuffle_v3p3_v2p3__3_2_2() { } define void @s_shuffle_v3p3_v2p3__3_u_2() { -; GFX900-LABEL: s_shuffle_v3p3_v2p3__3_u_2: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:5] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s5 -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:10] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v3p3_v2p3__3_u_2: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:5] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s5 -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:10] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v3p3_v2p3__3_u_2: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:1] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s1 -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:10] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: s_shuffle_v3p3_v2p3__3_u_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s11 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:10] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <3 x i32> @@ -3446,11 +3214,10 @@ define void @s_shuffle_v3p3_v2p3__3_0_2() { ; GFX900-NEXT: ; def s[4:5] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ; def s[10:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s7 +; GFX900-NEXT: s_mov_b32 s8, s11 ; GFX900-NEXT: s_mov_b32 s9, s4 -; GFX900-NEXT: s_mov_b32 s10, s6 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:10] ; GFX900-NEXT: ;;#ASMEND @@ -3463,11 +3230,10 @@ define void @s_shuffle_v3p3_v2p3__3_0_2() { ; GFX90A-NEXT: ; def s[4:5] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ; def s[10:11] ; GFX90A-NEXT: ;;#ASMEND -; 
GFX90A-NEXT: s_mov_b32 s8, s7 +; GFX90A-NEXT: s_mov_b32 s8, s11 ; GFX90A-NEXT: s_mov_b32 s9, s4 -; GFX90A-NEXT: s_mov_b32 s10, s6 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:10] ; GFX90A-NEXT: ;;#ASMEND @@ -3480,11 +3246,10 @@ define void @s_shuffle_v3p3_v2p3__3_0_2() { ; GFX942-NEXT: ; def s[0:1] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[2:3] +; GFX942-NEXT: ; def s[10:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s3 +; GFX942-NEXT: s_mov_b32 s8, s11 ; GFX942-NEXT: s_mov_b32 s9, s0 -; GFX942-NEXT: s_mov_b32 s10, s2 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:10] ; GFX942-NEXT: ;;#ASMEND @@ -3497,53 +3262,20 @@ define void @s_shuffle_v3p3_v2p3__3_0_2() { } define void @s_shuffle_v3p3_v2p3__3_1_2() { -; GFX900-LABEL: s_shuffle_v3p3_v2p3__3_1_2: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:9] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:5] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s5 -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:10] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v3p3_v2p3__3_1_2: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:9] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:5] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s5 -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:10] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v3p3_v2p3__3_1_2: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:9] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:1] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s1 -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:10] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: s_shuffle_v3p3_v2p3__3_1_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[8:9] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s11 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:10] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <3 x i32> diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3p3.v3p3.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3p3.v3p3.ll index cecd2a0e4b015..62b9da9fedb95 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v3p3.v3p3.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v3p3.v3p3.ll @@ -58,12 +58,11 @@ define void @v_shuffle_v3p3_v3p3__1_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v3p3__1_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], 
s[16:17] +; GFX900-NEXT: global_store_dwordx3 v3, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -153,12 +152,11 @@ define void @v_shuffle_v3p3_v3p3__4_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v3p3__4_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v3, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -240,15 +238,14 @@ define void @v_shuffle_v3p3_v3p3__5_0_u(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -355,9 +352,8 @@ define void @v_shuffle_v3p3_v3p3__5_2_u(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: global_store_dwordx3 v6, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -403,13 +399,12 @@ define void @v_shuffle_v3p3_v3p3__5_3_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v3p3__5_3_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx3 v3, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -495,9 +490,8 @@ define void @v_shuffle_v3p3_v3p3__5_5_u(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 ; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v3, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -540,14 +534,12 @@ define void @v_shuffle_v3p3_v3p3__5_5_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 
-; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v6, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -597,16 +589,14 @@ define void @v_shuffle_v3p3_v3p3__5_5_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v4 -; GFX900-NEXT: v_mov_b32_e32 v4, v1 -; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: global_store_dwordx3 v5, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -656,15 +646,14 @@ define void @v_shuffle_v3p3_v3p3__5_5_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v1, v5 -; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -760,14 +749,13 @@ define void @v_shuffle_v3p3_v3p3__5_5_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v3p3__5_5_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: v_mov_b32_e32 v4, v2 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -813,10 +801,10 @@ define void @v_shuffle_v3p3_v3p3__5_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -857,12 +845,11 @@ define void @v_shuffle_v3p3_v3p3__u_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v0 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: 
v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v0, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -994,13 +981,12 @@ define void @v_shuffle_v3p3_v3p3__2_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1041,12 +1027,11 @@ define void @v_shuffle_v3p3_v3p3__3_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v0 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v0, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1088,14 +1073,12 @@ define void @v_shuffle_v3p3_v3p3__4_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx3 v6, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1145,16 +1128,15 @@ define void @v_shuffle_v3p3_v3p3__5_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1376,16 +1358,15 @@ define void @v_shuffle_v3p3_v3p3__5_3_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; 
GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: global_store_dwordx3 v7, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1574,10 +1555,10 @@ define void @v_shuffle_v3p3_v3p3__1_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1704,15 +1685,14 @@ define void @v_shuffle_v3p3_v3p3__4_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx3 v5, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1816,15 +1796,14 @@ define void @v_shuffle_v3p3_v3p3__5_u_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, v5 +; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1872,16 +1851,14 @@ define void @v_shuffle_v3p3_v3p3__5_0_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: v_mov_b32_e32 v4, v1 -; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, v5 +; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1931,16 +1908,15 @@ define void @v_shuffle_v3p3_v3p3__5_2_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def 
v[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: v_mov_b32_e32 v4, v2 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: global_store_dwordx3 v7, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1989,16 +1965,15 @@ define void @v_shuffle_v3p3_v3p3__5_3_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v3, v4 -; GFX900-NEXT: v_mov_b32_e32 v4, v2 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2100,12 +2075,12 @@ define void @v_shuffle_v3p3_v3p3__u_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v3p3__u_2_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2184,13 +2159,12 @@ define void @v_shuffle_v3p3_v3p3__1_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v3p3__1_2_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2232,10 +2206,10 @@ define void @v_shuffle_v3p3_v3p3__2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2274,12 +2248,12 @@ define void @v_shuffle_v3p3_v3p3__3_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v3p3__3_2_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: 
global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2317,15 +2291,14 @@ define void @v_shuffle_v3p3_v3p3__4_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2374,13 +2347,13 @@ define void @v_shuffle_v3p3_v3p3__5_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx3 v6, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2480,16 +2453,15 @@ define void @v_shuffle_v3p3_v3p3__5_0_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v2 -; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx3 v7, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2591,15 +2563,15 @@ define void @v_shuffle_v3p3_v3p3__5_3_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2751,12 +2723,11 @@ define void @v_shuffle_v3p3_v3p3__1_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v3p3__1_3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: 
v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v3, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2895,13 +2866,12 @@ define void @v_shuffle_v3p3_v3p3__5_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2988,16 +2958,14 @@ define void @v_shuffle_v3p3_v3p3__5_0_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: v_mov_b32_e32 v4, v1 -; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3103,10 +3071,8 @@ define void @v_shuffle_v3p3_v3p3__5_2_3(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: global_store_dwordx3 v6, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3306,9 +3272,8 @@ define void @v_shuffle_v3p3_v3p3__1_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: v_mov_b32_e32 v4, v3 -; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx3 v5, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3456,10 +3421,10 @@ define void @v_shuffle_v3p3_v3p3__4_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3545,13 +3510,12 @@ define void @v_shuffle_v3p3_v3p3__5_u_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v3p3__5_u_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; 
GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3592,15 +3556,15 @@ define void @v_shuffle_v3p3_v3p3__5_0_4(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v2 -; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3705,14 +3669,13 @@ define void @v_shuffle_v3p3_v3p3__5_2_4(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: global_store_dwordx3 v6, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3760,13 +3723,12 @@ define void @v_shuffle_v3p3_v3p3__5_3_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v3p3__5_3_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: v_mov_b32_e32 v4, v1 -; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3807,12 +3769,12 @@ define void @v_shuffle_v3p3_v3p3__u_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v3p3__u_5_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3913,9 +3875,9 @@ define void @v_shuffle_v3p3_v3p3__1_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 ; GFX900-NEXT: v_mov_b32_e32 v3, v4 -; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX900-NEXT: 
global_store_dwordx3 v5, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3963,15 +3925,15 @@ define void @v_shuffle_v3p3_v3p3__2_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 ; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: v_mov_b32_e32 v4, v5 -; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4060,13 +4022,12 @@ define void @v_shuffle_v3p3_v3p3__4_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v3p3__4_5_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4150,15 +4111,15 @@ define void @v_shuffle_v3p3_v3p3__5_0_5(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4262,15 +4223,15 @@ define void @v_shuffle_v3p3_v3p3__5_2_5(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v5 ; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: v_mov_b32_e32 v4, v2 -; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v6, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4316,14 +4277,13 @@ define void @v_shuffle_v3p3_v3p3__5_3_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v3p3__5_3_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v2 -; GFX900-NEXT: 
global_store_dwordx3 v6, v[3:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4bf16.v3bf16.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4bf16.v3bf16.ll index fa422e48bbce0..89ce868b03546 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4bf16.v3bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4bf16.v3bf16.ll @@ -103,12 +103,11 @@ define void @v_shuffle_v4bf16_v3bf16__2_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__2_u_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v2, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -203,12 +202,11 @@ define void @v_shuffle_v4bf16_v3bf16__5_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_u_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v2, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -567,16 +565,15 @@ define void @v_shuffle_v4bf16_v3bf16__5_5_0_u(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[1:2] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s4, 0x5040100 -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_perm_b32 v1, v2, v2, s4 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -744,14 +741,13 @@ define void @v_shuffle_v4bf16_v3bf16__5_5_3_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_5_3_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] -; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s4, 0x5040100 ; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1606,16 +1602,15 @@ define void @v_shuffle_v4bf16_v3bf16__5_u_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v2, v2, v2, s4 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s4, 0x5040100 -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_perm_b32 v1, v0, v0, s4 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v4, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2639,16 +2634,15 @@ define void @v_shuffle_v4bf16_v3bf16__5_u_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v2, v2, v2, s4 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s4, 0x7060302 -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_perm_b32 v1, v0, v0, s4 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v4, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3017,16 +3011,15 @@ define void @v_shuffle_v4bf16_v3bf16__5_5_u_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[1:2] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s4, 0x5040100 -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_perm_b32 v1, v2, v2, s4 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3077,16 +3070,15 @@ define void @v_shuffle_v4bf16_v3bf16__5_5_0_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[1:2] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s4, 0x5040100 -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_perm_b32 v1, v2, v2, s4 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3697,16 +3689,15 @@ define void @v_shuffle_v4bf16_v3bf16__5_u_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ; def v[1:2] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s4, 0x5040100 
-; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v2, v2, v2, s4 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4459,12 +4450,11 @@ define void @v_shuffle_v4bf16_v3bf16__2_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__2_3_3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v2, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5982,14 +5972,13 @@ define void @v_shuffle_v4bf16_v3bf16__5_5_u_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_5_u_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] -; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s4, 0x5040100 ; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6223,14 +6212,13 @@ define void @v_shuffle_v4bf16_v3bf16__5_5_3_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_5_3_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] -; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s4, 0x5040100 ; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4bf16.v4bf16.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4bf16.v4bf16.ll index ab297c02fe3b5..8e24d6e02f3ff 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4bf16.v4bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4bf16.v4bf16.ll @@ -100,12 +100,11 @@ define void @v_shuffle_v4bf16_v4bf16__2_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__2_u_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v2, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -238,12 +237,11 
@@ define void @v_shuffle_v4bf16_v4bf16__6_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__6_u_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v2, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -721,16 +719,15 @@ define void @v_shuffle_v4bf16_v4bf16__7_7_0_u(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[1:2] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s4, 0x7060302 -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_perm_b32 v1, v2, v2, s4 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -950,14 +947,13 @@ define void @v_shuffle_v4bf16_v4bf16__7_7_4_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__7_7_4_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] -; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s4, 0x7060302 ; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3870,16 +3866,15 @@ define void @v_shuffle_v4bf16_v4bf16__7_7_u_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[1:2] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s4, 0x7060302 -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_perm_b32 v1, v2, v2, s4 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3928,16 +3923,15 @@ define void @v_shuffle_v4bf16_v4bf16__7_7_0_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[1:2] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s4, 0x7060302 -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_perm_b32 v1, v2, v2, s4 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: 
global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -7088,12 +7082,11 @@ define void @v_shuffle_v4bf16_v4bf16__2_4_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__2_4_4_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v2, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -9087,14 +9080,13 @@ define void @v_shuffle_v4bf16_v4bf16__7_7_u_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__7_7_u_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] -; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s4, 0x7060302 ; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -9377,14 +9369,13 @@ define void @v_shuffle_v4bf16_v4bf16__7_7_4_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__7_7_4_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] -; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s4, 0x7060302 ; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4f16.v3f16.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4f16.v3f16.ll index e91433ac4c1f7..d1ff8c658c77d 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4f16.v3f16.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4f16.v3f16.ll @@ -103,12 +103,11 @@ define void @v_shuffle_v4f16_v3f16__2_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f16_v3f16__2_u_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v2, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -203,12 +202,11 @@ define void @v_shuffle_v4f16_v3f16__5_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f16_v3f16__5_u_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v2, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -567,16 +565,15 @@ define void @v_shuffle_v4f16_v3f16__5_5_0_u(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[1:2] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s4, 0x5040100 -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_perm_b32 v1, v2, v2, s4 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -744,14 +741,13 @@ define void @v_shuffle_v4f16_v3f16__5_5_3_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f16_v3f16__5_5_3_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] -; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s4, 0x5040100 ; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1606,16 +1602,15 @@ define void @v_shuffle_v4f16_v3f16__5_u_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v2, v2, v2, s4 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s4, 0x5040100 -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_perm_b32 v1, v0, v0, s4 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v4, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2639,16 +2634,15 @@ define void @v_shuffle_v4f16_v3f16__5_u_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v2, v2, v2, s4 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s4, 0x7060302 -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_perm_b32 v1, v0, v0, s4 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: 
global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v4, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3017,16 +3011,15 @@ define void @v_shuffle_v4f16_v3f16__5_5_u_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[1:2] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s4, 0x5040100 -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_perm_b32 v1, v2, v2, s4 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3077,16 +3070,15 @@ define void @v_shuffle_v4f16_v3f16__5_5_0_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[1:2] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s4, 0x5040100 -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_perm_b32 v1, v2, v2, s4 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3697,16 +3689,15 @@ define void @v_shuffle_v4f16_v3f16__5_u_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ; def v[1:2] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s4, 0x5040100 -; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v2, v2, v2, s4 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4459,12 +4450,11 @@ define void @v_shuffle_v4f16_v3f16__2_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f16_v3f16__2_3_3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v2, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5982,14 +5972,13 @@ define void @v_shuffle_v4f16_v3f16__5_5_u_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f16_v3f16__5_5_u_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] -; GFX900-NEXT: 
;;#ASMEND ; GFX900-NEXT: s_mov_b32 s4, 0x5040100 ; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6223,14 +6212,13 @@ define void @v_shuffle_v4f16_v3f16__5_5_3_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f16_v3f16__5_5_3_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] -; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s4, 0x5040100 ; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4f16.v4f16.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4f16.v4f16.ll index 47100b9983559..8a9a0d1a7ef5d 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4f16.v4f16.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4f16.v4f16.ll @@ -100,12 +100,11 @@ define void @v_shuffle_v4f16_v4f16__2_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f16_v4f16__2_u_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v2, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -238,12 +237,11 @@ define void @v_shuffle_v4f16_v4f16__6_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f16_v4f16__6_u_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v2, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -721,16 +719,15 @@ define void @v_shuffle_v4f16_v4f16__7_7_0_u(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[1:2] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s4, 0x7060302 -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_perm_b32 v1, v2, v2, s4 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 
s[30:31] ; @@ -950,14 +947,13 @@ define void @v_shuffle_v4f16_v4f16__7_7_4_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f16_v4f16__7_7_4_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] -; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s4, 0x7060302 ; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3870,16 +3866,15 @@ define void @v_shuffle_v4f16_v4f16__7_7_u_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[1:2] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s4, 0x7060302 -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_perm_b32 v1, v2, v2, s4 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3928,16 +3923,15 @@ define void @v_shuffle_v4f16_v4f16__7_7_0_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[1:2] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s4, 0x7060302 -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_perm_b32 v1, v2, v2, s4 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -7088,12 +7082,11 @@ define void @v_shuffle_v4f16_v4f16__2_4_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f16_v4f16__2_4_4_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v2, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -9087,14 +9080,13 @@ define void @v_shuffle_v4f16_v4f16__7_7_u_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f16_v4f16__7_7_u_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] -; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s4, 0x7060302 ; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, 
v[1:2], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -9377,14 +9369,13 @@ define void @v_shuffle_v4f16_v4f16__7_7_4_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f16_v4f16__7_7_4_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] -; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s4, 0x7060302 ; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4f32.v3f32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4f32.v3f32.ll index 7c8417837f788..5828e40595f9f 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4f32.v3f32.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4f32.v3f32.ll @@ -3985,12 +3985,11 @@ define void @v_shuffle_v4f32_v3f32__5_5_1_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[2:3] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v0, v6 ; GFX90A-NEXT: v_mov_b32_e32 v1, v6 ; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] @@ -4004,11 +4003,10 @@ define void @v_shuffle_v4f32_v3f32__5_5_1_2(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v7, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0] +; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[2:3] op_sel:[1,0] ; GFX942-NEXT: v_mov_b32_e32 v0, v6 ; GFX942-NEXT: v_mov_b32_e32 v1, v6 ; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] @@ -4105,12 +4103,11 @@ define void @v_shuffle_v4f32_v3f32__5_5_4_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[2:3] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v0, v6 ; GFX90A-NEXT: v_mov_b32_e32 v1, v6 ; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] @@ -4124,12 +4121,11 @@ define void @v_shuffle_v4f32_v3f32__5_5_4_2(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v7, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[0:1] op_sel:[1,0] +; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[2:3] op_sel:[1,0] ; GFX942-NEXT: v_mov_b32_e32 
v0, v6 ; GFX942-NEXT: v_mov_b32_e32 v1, v6 ; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] @@ -6709,16 +6705,16 @@ define void @v_shuffle_v4f32_v3f32__5_5_1_5(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[2:3], v[2:3] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -6726,17 +6722,17 @@ define void @v_shuffle_v4f32_v3f32__5_5_1_5(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0] -; GFX942-NEXT: v_mov_b32_e32 v1, v4 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: v_pk_mov_b32 v[6:7], v[2:3], v[2:3] op_sel:[1,0] +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -6871,28 +6867,29 @@ define void @v_shuffle_v4f32_v3f32__5_5_4_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_5_4_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[0:1], v[2:3] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v3f32__5_5_4_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0] -; GFX942-NEXT: v_mov_b32_e32 v1, v4 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_pk_mov_b32 v[6:7], v[0:1], v[2:3] op_sel:[1,0] +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, 
v2 +; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4i16.v3i16.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4i16.v3i16.ll index 7b3a5a879f44f..1a7e281e7e138 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4i16.v3i16.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4i16.v3i16.ll @@ -103,12 +103,11 @@ define void @v_shuffle_v4i16_v3i16__2_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i16_v3i16__2_u_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v2, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -203,12 +202,11 @@ define void @v_shuffle_v4i16_v3i16__5_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i16_v3i16__5_u_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v2, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -567,16 +565,15 @@ define void @v_shuffle_v4i16_v3i16__5_5_0_u(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[1:2] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s4, 0x5040100 -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_perm_b32 v1, v2, v2, s4 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -744,14 +741,13 @@ define void @v_shuffle_v4i16_v3i16__5_5_3_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i16_v3i16__5_5_3_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] -; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s4, 0x5040100 ; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1606,16 +1602,15 @@ define void @v_shuffle_v4i16_v3i16__5_u_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v2, v2, v2, s4 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s4, 0x5040100 -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_perm_b32 v1, v0, v0, s4 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v4, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2639,16 +2634,15 @@ define void @v_shuffle_v4i16_v3i16__5_u_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v2, v2, v2, s4 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s4, 0x7060302 -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_perm_b32 v1, v0, v0, s4 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v4, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3017,16 +3011,15 @@ define void @v_shuffle_v4i16_v3i16__5_5_u_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[1:2] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s4, 0x5040100 -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_perm_b32 v1, v2, v2, s4 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3077,16 +3070,15 @@ define void @v_shuffle_v4i16_v3i16__5_5_0_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[1:2] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s4, 0x5040100 -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_perm_b32 v1, v2, v2, s4 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3697,16 +3689,15 @@ define void @v_shuffle_v4i16_v3i16__5_u_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ; def v[1:2] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s4, 0x5040100 -; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v2, v2, v2, s4 ; GFX900-NEXT: 
;;#ASMSTART -; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4459,12 +4450,11 @@ define void @v_shuffle_v4i16_v3i16__2_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i16_v3i16__2_3_3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v2, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5982,14 +5972,13 @@ define void @v_shuffle_v4i16_v3i16__5_5_u_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i16_v3i16__5_5_u_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] -; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s4, 0x5040100 ; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6223,14 +6212,13 @@ define void @v_shuffle_v4i16_v3i16__5_5_3_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i16_v3i16__5_5_3_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] -; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s4, 0x5040100 ; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4i16.v4i16.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4i16.v4i16.ll index 2a371b7c7d2d3..05ebf49b997eb 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4i16.v4i16.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4i16.v4i16.ll @@ -100,12 +100,11 @@ define void @v_shuffle_v4i16_v4i16__2_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i16_v4i16__2_u_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v2, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -238,12 +237,11 @@ define void @v_shuffle_v4i16_v4i16__6_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i16_v4i16__6_u_u_u: ; GFX900: ; %bb.0: ; 
GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v2, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -721,16 +719,15 @@ define void @v_shuffle_v4i16_v4i16__7_7_0_u(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[1:2] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s4, 0x7060302 -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_perm_b32 v1, v2, v2, s4 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -950,14 +947,13 @@ define void @v_shuffle_v4i16_v4i16__7_7_4_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i16_v4i16__7_7_4_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] -; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s4, 0x7060302 ; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3870,16 +3866,15 @@ define void @v_shuffle_v4i16_v4i16__7_7_u_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[1:2] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s4, 0x7060302 -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_perm_b32 v1, v2, v2, s4 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3928,16 +3923,15 @@ define void @v_shuffle_v4i16_v4i16__7_7_0_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[1:2] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s4, 0x7060302 -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_perm_b32 v1, v2, v2, s4 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -7088,12 +7082,11 @@ define void @v_shuffle_v4i16_v4i16__2_4_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i16_v4i16__2_4_4_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v2, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -9087,14 +9080,13 @@ define void @v_shuffle_v4i16_v4i16__7_7_u_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i16_v4i16__7_7_u_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] -; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s4, 0x7060302 ; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -9377,14 +9369,13 @@ define void @v_shuffle_v4i16_v4i16__7_7_4_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i16_v4i16__7_7_4_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] -; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s4, 0x7060302 ; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4i32.v3i32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4i32.v3i32.ll index f7149350e74d3..3a659e1753e97 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4i32.v3i32.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4i32.v3i32.ll @@ -3985,12 +3985,11 @@ define void @v_shuffle_v4i32_v3i32__5_5_1_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[2:3] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v0, v6 ; GFX90A-NEXT: v_mov_b32_e32 v1, v6 ; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] @@ -4004,11 +4003,10 @@ define void @v_shuffle_v4i32_v3i32__5_5_1_2(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v7, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0] +; 
GFX942-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[2:3] op_sel:[1,0] ; GFX942-NEXT: v_mov_b32_e32 v0, v6 ; GFX942-NEXT: v_mov_b32_e32 v1, v6 ; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] @@ -4105,12 +4103,11 @@ define void @v_shuffle_v4i32_v3i32__5_5_4_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[2:3] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v0, v6 ; GFX90A-NEXT: v_mov_b32_e32 v1, v6 ; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] @@ -4124,12 +4121,11 @@ define void @v_shuffle_v4i32_v3i32__5_5_4_2(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v7, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[0:1] op_sel:[1,0] +; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[2:3] op_sel:[1,0] ; GFX942-NEXT: v_mov_b32_e32 v0, v6 ; GFX942-NEXT: v_mov_b32_e32 v1, v6 ; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] @@ -6709,16 +6705,16 @@ define void @v_shuffle_v4i32_v3i32__5_5_1_5(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[2:3], v[2:3] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -6726,17 +6722,17 @@ define void @v_shuffle_v4i32_v3i32__5_5_1_5(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0] -; GFX942-NEXT: v_mov_b32_e32 v1, v4 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: v_pk_mov_b32 v[6:7], v[2:3], v[2:3] op_sel:[1,0] +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -6871,28 +6867,29 @@ define void @v_shuffle_v4i32_v3i32__5_5_4_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_5_4_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; 
GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[0:1], v[2:3] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v3i32__5_5_4_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0] -; GFX942-NEXT: v_mov_b32_e32 v1, v4 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_pk_mov_b32 v[6:7], v[0:1], v[2:3] op_sel:[1,0] +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4p3.v3p3.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4p3.v3p3.ll index aa9e23b971823..f1c1e4b20f242 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4p3.v3p3.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4p3.v3p3.ll @@ -3985,12 +3985,11 @@ define void @v_shuffle_v4p3_v3p3__5_5_1_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[2:3] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v0, v6 ; GFX90A-NEXT: v_mov_b32_e32 v1, v6 ; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] @@ -4004,11 +4003,10 @@ define void @v_shuffle_v4p3_v3p3__5_5_1_2(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v7, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0] +; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[2:3] op_sel:[1,0] ; GFX942-NEXT: v_mov_b32_e32 v0, v6 ; GFX942-NEXT: v_mov_b32_e32 v1, v6 ; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] @@ -4105,12 +4103,11 @@ define void @v_shuffle_v4p3_v3p3__5_5_4_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[2:3] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v0, v6 ; GFX90A-NEXT: v_mov_b32_e32 v1, v6 ; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] @@ -4124,12 +4121,11 @@ define void @v_shuffle_v4p3_v3p3__5_5_4_2(ptr addrspace(1) inreg %ptr) { ; 
GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v7, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[0:1] op_sel:[1,0] +; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[2:3] op_sel:[1,0] ; GFX942-NEXT: v_mov_b32_e32 v0, v6 ; GFX942-NEXT: v_mov_b32_e32 v1, v6 ; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] @@ -6709,16 +6705,16 @@ define void @v_shuffle_v4p3_v3p3__5_5_1_5(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[2:3], v[2:3] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -6726,17 +6722,17 @@ define void @v_shuffle_v4p3_v3p3__5_5_1_5(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0] -; GFX942-NEXT: v_mov_b32_e32 v1, v4 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: v_pk_mov_b32 v[6:7], v[2:3], v[2:3] op_sel:[1,0] +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -6871,28 +6867,29 @@ define void @v_shuffle_v4p3_v3p3__5_5_4_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_5_4_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[0:1], v[2:3] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v3p3__5_5_4_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; 
GFX942-NEXT: v_mov_b32_e32 v5, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0] -; GFX942-NEXT: v_mov_b32_e32 v1, v4 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_pk_mov_b32 v[6:7], v[0:1], v[2:3] op_sel:[1,0] +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() diff --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-mul.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-mul.ll index 1b4ed67eb6eea..94448411cfd0e 100644 --- a/llvm/test/CodeGen/AMDGPU/vector-reduce-mul.ll +++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-mul.ll @@ -2625,12 +2625,11 @@ define i64 @test_vector_reduce_mul_v3i64(<3 x i64> %v) { ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v0, v2, 0 ; GFX7-GISEL-NEXT: v_mov_b32_e32 v6, v1 -; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v8 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v0, v3, v[1:2] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v0, v3, v[8:9] ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v7, v4, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v6, v2, v[8:9] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v6, v2, v[9:10] ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v7, v5, v[1:2] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v10, v4, v[8:9] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v11, v4, v[8:9] ; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: test_vector_reduce_mul_v3i64: @@ -2653,12 +2652,11 @@ define i64 @test_vector_reduce_mul_v3i64(<3 x i64> %v) { ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v0, v2, 0 ; GFX8-GISEL-NEXT: v_mov_b32_e32 v6, v1 -; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v8 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v0, v3, v[1:2] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v0, v3, v[8:9] ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v7, v4, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v6, v2, v[8:9] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v6, v2, v[9:10] ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v7, v5, v[1:2] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v10, v4, v[8:9] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v11, v4, v[8:9] ; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-SDAG-LABEL: test_vector_reduce_mul_v3i64: @@ -2703,13 +2701,12 @@ define i64 @test_vector_reduce_mul_v3i64(<3 x i64> %v) { ; GFX10-GISEL-LABEL: test_vector_reduce_mul_v3i64: ; GFX10-GISEL: ; %bb.0: ; %entry ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: v_mov_b32_e32 v6, v0 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v7, v1 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[8:9], s4, v6, v2, 0 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[9:10], s4, v6, v3, v[9:10] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[0:1], s4, v8, v4, 0 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[2:3], s4, v7, v2, v[9:10] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[5:6], s4, v8, v5, v[1:2] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[7:8], s4, v0, v2, 0 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v6, v1 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[8:9], s4, v0, v3, v[8:9] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[0:1], s4, v7, v4, 0 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[2:3], s4, v6, v2, v[8:9] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[5:6], s4, 
v7, v5, v[1:2] ; GFX10-GISEL-NEXT: v_mad_u64_u32 v[1:2], s4, v2, v4, v[5:6] ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] ; @@ -2731,16 +2728,16 @@ define i64 @test_vector_reduce_mul_v3i64(<3 x i64> %v) { ; GFX11-GISEL-LABEL: test_vector_reduce_mul_v3i64: ; GFX11-GISEL: ; %bb.0: ; %entry ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_dual_mov_b32 v6, v0 :: v_dual_mov_b32 v7, v1 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[8:9], null, v6, v2, 0 -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[0:1], null, v8, v4, 0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[10:11], null, v6, v3, v[9:10] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[11:12], null, v7, v2, v[10:11] -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[6:7], null, v8, v5, v[1:2] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[1:2], null, v11, v4, v[6:7] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[7:8], null, v0, v2, 0 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v6, v1 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[9:10], null, v0, v3, v[8:9] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[0:1], null, v7, v4, 0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[10:11], null, v6, v2, v[9:10] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[8:9], null, v7, v5, v[1:2] +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[1:2], null, v10, v4, v[8:9] ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-SDAG-LABEL: test_vector_reduce_mul_v3i64: @@ -2810,18 +2807,16 @@ define i64 @test_vector_reduce_mul_v4i64(<4 x i64> %v) { ; GFX7-GISEL-LABEL: test_vector_reduce_mul_v4i64: ; GFX7-GISEL: ; %bb.0: ; %entry ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v0, v4, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v2, v6, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v2, v6, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v0, v4, 0 ; GFX7-GISEL-NEXT: v_mov_b32_e32 v8, v1 -; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v10 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v0, v5, v[1:2] -; GFX7-GISEL-NEXT: v_mov_b32_e32 v0, v12 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[15:16], s[4:5], v2, v7, v[0:1] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v9, v11, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v3, v6, v[15:16] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v8, v4, v[13:14] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v17, v[1:2] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v5, v11, v[3:4] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v2, v7, v[10:11] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[15:16], s[4:5], v0, v5, v[12:13] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v3, v6, v[13:14] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v11, v9, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v8, v4, v[15:16] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v11, v17, v[1:2] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v5, v9, v[3:4] ; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: test_vector_reduce_mul_v4i64: @@ -2847,18 +2842,16 @@ define i64 @test_vector_reduce_mul_v4i64(<4 x i64> %v) { ; GFX8-GISEL-LABEL: 
test_vector_reduce_mul_v4i64: ; GFX8-GISEL: ; %bb.0: ; %entry ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v0, v4, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v2, v6, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v2, v6, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v0, v4, 0 ; GFX8-GISEL-NEXT: v_mov_b32_e32 v8, v1 -; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v10 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v0, v5, v[1:2] -; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, v12 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[15:16], s[4:5], v2, v7, v[0:1] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v9, v11, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v3, v6, v[15:16] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v8, v4, v[13:14] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v17, v[1:2] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v5, v11, v[3:4] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v2, v7, v[10:11] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[15:16], s[4:5], v0, v5, v[12:13] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v3, v6, v[13:14] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v11, v9, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v8, v4, v[15:16] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v11, v17, v[1:2] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v5, v9, v[3:4] ; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-SDAG-LABEL: test_vector_reduce_mul_v4i64: @@ -2915,19 +2908,16 @@ define i64 @test_vector_reduce_mul_v4i64(<4 x i64> %v) { ; GFX10-GISEL-LABEL: test_vector_reduce_mul_v4i64: ; GFX10-GISEL: ; %bb.0: ; %entry ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[10:11], s4, v2, v6, 0 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v8, v0 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v9, v1 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[12:13], s4, v8, v4, 0 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, v11 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[14:15], s4, v2, v7, v[0:1] -; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, v13 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[0:1], s4, v12, v10, 0 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[7:8], s4, v8, v5, v[2:3] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[2:3], s4, v3, v6, v[14:15] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[3:4], s4, v9, v4, v[7:8] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[1:2], s4, v12, v2, v[1:2] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[1:2], s4, v3, v10, v[1:2] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[9:10], s4, v2, v6, 0 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[11:12], s4, v0, v4, 0 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v8, v1 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[1:2], s4, v2, v7, v[10:11] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[12:13], s4, v0, v5, v[12:13] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[2:3], s4, v3, v6, v[1:2] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[0:1], s4, v11, v9, 0 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[3:4], s4, v8, v4, v[12:13] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[1:2], s4, v11, v2, v[1:2] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[1:2], s4, v3, v9, v[1:2] ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-LABEL: test_vector_reduce_mul_v4i64: @@ -2953,23 +2943,19 @@ define i64 @test_vector_reduce_mul_v4i64(<4 x i64> %v) { ; GFX11-GISEL-LABEL: test_vector_reduce_mul_v4i64: ; GFX11-GISEL: ; %bb.0: ; %entry ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_dual_mov_b32 v8, v0 :: v_dual_mov_b32 v9, v1 -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[10:11], null, v2, v6, 0 -; GFX11-GISEL-NEXT: s_delay_alu 
instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[12:13], null, v8, v4, 0 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, v11 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[14:15], null, v2, v7, v[0:1] -; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, v13 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[0:1], null, v12, v10, 0 -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[15:16], null, v8, v5, v[2:3] -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[7:8], null, v3, v6, v[14:15] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[5:6], null, v9, v4, v[15:16] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[9:10], null, v2, v6, 0 +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[11:12], null, v0, v4, 0 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v8, v1 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[13:14], null, v2, v7, v[10:11] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[14:15], null, v0, v5, v[12:13] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[0:1], null, v11, v9, 0 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[3:4], null, v12, v7, v[1:2] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[1:2], null, v5, v10, v[3:4] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[15:16], null, v3, v6, v[13:14] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[5:6], null, v8, v4, v[14:15] +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[3:4], null, v11, v15, v[1:2] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[1:2], null, v5, v9, v[3:4] ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-SDAG-LABEL: test_vector_reduce_mul_v4i64: @@ -3070,29 +3056,26 @@ define i64 @test_vector_reduce_mul_v8i64(<8 x i64> %v) { ; GFX7-GISEL: ; %bb.0: ; %entry ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v4, v12, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v0, v8, 0 ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v4, v13, v[17:18] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v5, v12, v[18:19] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v20, v16, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v20, v22, v[5:6] -; GFX7-GISEL-NEXT: v_mov_b32_e32 v5, v21 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[19:20], s[4:5], v6, v14, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v5, v12, v[18:19] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v0, v8, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v4, v16, 0 ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v0, v9, v[5:6] -; GFX7-GISEL-NEXT: v_mov_b32_e32 v0, v20 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[21:22], s[4:5], v1, v8, v[17:18] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v6, v15, v[0:1] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, v10, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v2, v11, v[1:2] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v7, v14, v[8:9] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v0, v19, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v3, v10, v[5:6] -; GFX7-GISEL-NEXT: v_mov_b32_e32 v2, v8 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v0, v1, v[2:3] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v7, 0 -; GFX7-GISEL-NEXT: 
v_mad_u64_u32 v[2:3], s[4:5], v14, v19, v[5:6] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v21, v16, v[12:13] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v4, v2, v[1:2] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v5, v7, v[8:9] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[21:22], s[4:5], v4, v20, v[13:14] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v14, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[19:20], s[4:5], v6, v15, v[5:6] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v2, v10, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[23:24], s[4:5], v7, v14, v[19:20] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v2, v11, v[6:7] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v5, v4, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[19:20], s[4:5], v5, v23, v[7:8] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[23:24], s[4:5], v3, v10, v[13:14] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v1, v8, v[17:18] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v23, v4, v[19:20] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v12, v6, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v2, v16, v[21:22] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v12, v7, v[1:2] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v6, v[4:5] ; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: test_vector_reduce_mul_v8i64: @@ -3139,29 +3122,26 @@ define i64 @test_vector_reduce_mul_v8i64(<8 x i64> %v) { ; GFX8-GISEL: ; %bb.0: ; %entry ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v4, v12, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v0, v8, 0 ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v4, v13, v[17:18] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v5, v12, v[18:19] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v20, v16, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v20, v22, v[5:6] -; GFX8-GISEL-NEXT: v_mov_b32_e32 v5, v21 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[19:20], s[4:5], v6, v14, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v5, v12, v[18:19] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v0, v8, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v4, v16, 0 ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v0, v9, v[5:6] -; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, v20 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[21:22], s[4:5], v1, v8, v[17:18] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v6, v15, v[0:1] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, v10, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v2, v11, v[1:2] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v7, v14, v[8:9] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v0, v19, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v3, v10, v[5:6] -; GFX8-GISEL-NEXT: v_mov_b32_e32 v2, v8 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v0, v1, v[2:3] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v7, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v14, v19, v[5:6] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v21, v16, v[12:13] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v4, v2, v[1:2] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v5, v7, v[8:9] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[21:22], s[4:5], v4, v20, v[13:14] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v14, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[19:20], s[4:5], v6, v15, v[5:6] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v2, v10, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[23:24], s[4:5], v7, v14, v[19:20] +; GFX8-GISEL-NEXT: v_mad_u64_u32 
v[13:14], s[4:5], v2, v11, v[6:7] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v5, v4, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[19:20], s[4:5], v5, v23, v[7:8] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[23:24], s[4:5], v3, v10, v[13:14] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v1, v8, v[17:18] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v23, v4, v[19:20] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v12, v6, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v2, v16, v[21:22] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v12, v7, v[1:2] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v6, v[4:5] ; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-SDAG-LABEL: test_vector_reduce_mul_v8i64: @@ -3266,34 +3246,27 @@ define i64 @test_vector_reduce_mul_v8i64(<8 x i64> %v) { ; GFX10-GISEL-LABEL: test_vector_reduce_mul_v8i64: ; GFX10-GISEL: ; %bb.0: ; %entry ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[17:18], s4, v0, v8, 0 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[19:20], s4, v2, v10, 0 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v16, v1 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[21:22], s4, v6, v14, 0 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[23:24], s4, v4, v12, 0 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, v18 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v18, v20 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[25:26], s4, v0, v9, v[1:2] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[1:2], s4, v2, v11, v[18:19] -; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, v22 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, v24 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[29:30], s4, v4, v13, v[2:3] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[26:27], s4, v6, v15, v[0:1] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[27:28], s4, v19, v21, 0 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[30:31], s4, v17, v23, 0 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[2:3], s4, v3, v10, v[1:2] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[6:7], s4, v7, v14, v[26:27] -; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, v28 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[3:4], s4, v5, v12, v[29:30] -; GFX10-GISEL-NEXT: v_mov_b32_e32 v4, v31 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[5:6], s4, v19, v6, v[0:1] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[6:7], s4, v16, v8, v[25:26] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[0:1], s4, v30, v27, 0 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[3:4], s4, v17, v3, v[4:5] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[4:5], s4, v2, v21, v[5:6] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[2:3], s4, v6, v23, v[3:4] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[3:4], s4, v30, v4, v[1:2] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[1:2], s4, v2, v27, v[3:4] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[16:17], s4, v6, v14, 0 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[18:19], s4, v0, v8, 0 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[20:21], s4, v2, v10, 0 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[22:23], s4, v4, v12, 0 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[24:25], s4, v6, v15, v[17:18] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[25:26], s4, v0, v9, v[19:20] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[26:27], s4, v2, v11, v[21:22] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[27:28], s4, v4, v13, v[23:24] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[13:14], s4, v7, v14, v[24:25] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[6:7], s4, v20, v16, 0 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[9:10], s4, v3, v10, v[26:27] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[4:5], s4, v5, v12, v[27:28] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[2:3], s4, v18, v22, 0 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[10:11], s4, v20, v13, v[7:8] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[7:8], s4, v1, v8, v[25:26] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[3:4], 
s4, v18, v4, v[3:4] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[4:5], s4, v9, v16, v[10:11] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[0:1], s4, v2, v6, 0 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[7:8], s4, v7, v22, v[3:4] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[1:2], s4, v2, v4, v[1:2] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[1:2], s4, v7, v6, v[1:2] ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-LABEL: test_vector_reduce_mul_v8i64: @@ -3337,39 +3310,34 @@ define i64 @test_vector_reduce_mul_v8i64(<8 x i64> %v) { ; GFX11-GISEL-LABEL: test_vector_reduce_mul_v8i64: ; GFX11-GISEL: ; %bb.0: ; %entry ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[17:18], null, v0, v8, 0 -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[19:20], null, v2, v10, 0 -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[21:22], null, v6, v14, 0 -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[23:24], null, v4, v12, 0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-GISEL-NEXT: v_dual_mov_b32 v16, v1 :: v_dual_mov_b32 v1, v18 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v18, v20 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[25:26], null, v0, v9, v[1:2] -; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, v22 :: v_dual_mov_b32 v1, v24 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[26:27], null, v2, v11, v[18:19] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[27:28], null, v6, v15, v[0:1] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[30:31], null, v4, v13, v[1:2] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[28:29], null, v19, v21, 0 -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[31:32], null, v17, v23, 0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[0:1], null, v7, v14, v[27:28] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[6:7], null, v3, v10, v[26:27] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[2:3], null, v5, v12, v[30:31] -; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, v29 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v3, v32 -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[9:10], null, v16, v8, v[25:26] -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[4:5], null, v19, v0, v[1:2] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[0:1], null, v31, v28, 0 -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[7:8], null, v17, v2, v[3:4] -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[2:3], null, v6, v21, v[4:5] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[3:4], null, v9, v23, v[7:8] -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[4:5], null, v31, v2, v[1:2] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[1:2], null, v3, v28, v[4:5] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[16:17], null, v6, v14, 0 +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[18:19], null, v0, v8, 0 +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[20:21], null, v2, v10, 0 +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[22:23], null, v4, v12, 0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[24:25], null, v6, v15, v[17:18] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[25:26], null, v0, v9, v[19:20] +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[26:27], null, v2, v11, 
v[21:22] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[27:28], null, v4, v13, v[23:24] +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[28:29], null, v7, v14, v[24:25] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[6:7], null, v20, v16, 0 +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[13:14], null, v3, v10, v[26:27] +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[9:10], null, v5, v12, v[27:28] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[2:3], null, v18, v22, 0 +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[4:5], null, v20, v28, v[7:8] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[10:11], null, v1, v8, v[25:26] +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[0:1], null, v2, v6, 0 +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[7:8], null, v18, v9, v[3:4] +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[8:9], null, v13, v16, v[4:5] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[3:4], null, v10, v22, v[7:8] +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[4:5], null, v2, v8, v[1:2] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[1:2], null, v3, v6, v[4:5] ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-SDAG-LABEL: test_vector_reduce_mul_v8i64: @@ -3551,60 +3519,49 @@ define i64 @test_vector_reduce_mul_v16i64(<16 x i64> %v) { ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[33:34], s[4:5], v8, v25, v[32:33] ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[35:36], s[4:5], v9, v24, v[33:34] ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v0, v16, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[32:33], s[4:5], v0, v17, v[9:10] ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[24:25], s[4:5], v8, v31, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[32:33], s[4:5], v8, v35, v[25:26] -; GFX7-GISEL-NEXT: v_mov_b32_e32 v8, v9 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[34:35], s[4:5], v0, v17, v[8:9] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v1, v16, v[34:35] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[36:37], s[4:5], v8, v35, v[25:26] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v1, v16, v[32:33] ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v10, v26, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v8, v31, v[32:33] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v8, v31, v[36:37] ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v10, v27, v[1:2] -; GFX7-GISEL-NEXT: buffer_load_dword v27, off, s[0:3], s32 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[31:32], s[4:5], v2, v18, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[33:34], s[4:5], v11, v26, v[8:9] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v31, v0, 0 -; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v9 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v31, v33, v[1:2] -; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v32 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[25:26], s[4:5], v2, v19, v[1:2] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v18, v[25:26] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[31:32], s[4:5], v11, v26, v[8:9] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v2, v18, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v2, v19, v[9:10] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v8, v0, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[25:26], s[4:5], v8, v31, v[2:3] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v3, v18, v[10:11] ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v12, v28, 0 -; 
GFX7-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v1, v0, v[9:10] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v4, v20, 0 -; GFX7-GISEL-NEXT: v_mov_b32_e32 v0, v3 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v12, v29, v[0:1] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v9, v2, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v13, v28, v[18:19] -; GFX7-GISEL-NEXT: v_mov_b32_e32 v3, v10 -; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v12 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v4, v21, v[3:4] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v8, v0, v[25:26] +; GFX7-GISEL-NEXT: buffer_load_dword v0, off, s[0:3], s32 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v12, v29, v[3:4] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v4, v20, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v13, v28, v[10:11] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v8, v2, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[25:26], s[4:5], v4, v21, v[9:10] ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v14, v30, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v9, v0, v[1:2] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v20, v[12:13] -; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v4 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v22, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v0, v2, v[18:19] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v4, v3, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[27:28], s[4:5], v8, v18, v[11:12] ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v14, v27, v[1:2] -; GFX7-GISEL-NEXT: v_mov_b32_e32 v2, v5 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v15, v30, v[9:10] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v6, v23, v[2:3] -; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v13 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v0, v[1:2] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v7, v22, v[9:10] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v24, v11, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v0, v3, v[5:6] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v8, v12, 0 -; GFX7-GISEL-NEXT: v_mov_b32_e32 v0, v10 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v24, v20, v[0:1] -; GFX7-GISEL-NEXT: v_mov_b32_e32 v0, v4 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v8, v1, v[0:1] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v9, v3, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v17, v12, v[13:14] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v16, v11, v[5:6] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v9, v7, v[1:2] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v12, v3, v[4:5] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v14, v0, v[4:5] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v15, v30, v[8:9] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v6, v22, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v23, v[9:10] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v8, v3, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v8, v13, v[12:13] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v5, v20, v[25:26] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v7, v22, v[14:15] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v8, v2, v[27:28] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v4, v3, v[18:19] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v1, v11, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v24, v10, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v1, v7, v[4:5] +; GFX7-GISEL-NEXT: 
v_mad_u64_u32 v[13:14], s[4:5], v24, v12, v[6:7] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v17, v11, v[8:9] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v3, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v16, v10, v[13:14] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v5, v6, v[1:2] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v7, v3, v[8:9] ; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: test_vector_reduce_mul_v16i64: @@ -3696,60 +3653,49 @@ define i64 @test_vector_reduce_mul_v16i64(<16 x i64> %v) { ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[33:34], s[4:5], v8, v25, v[32:33] ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[35:36], s[4:5], v9, v24, v[33:34] ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v0, v16, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[32:33], s[4:5], v0, v17, v[9:10] ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[24:25], s[4:5], v8, v31, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[32:33], s[4:5], v8, v35, v[25:26] -; GFX8-GISEL-NEXT: v_mov_b32_e32 v8, v9 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[34:35], s[4:5], v0, v17, v[8:9] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v1, v16, v[34:35] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[36:37], s[4:5], v8, v35, v[25:26] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v1, v16, v[32:33] ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v10, v26, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v8, v31, v[32:33] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v8, v31, v[36:37] ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v10, v27, v[1:2] -; GFX8-GISEL-NEXT: buffer_load_dword v27, off, s[0:3], s32 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[31:32], s[4:5], v2, v18, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[33:34], s[4:5], v11, v26, v[8:9] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v31, v0, 0 -; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v9 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v31, v33, v[1:2] -; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v32 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[25:26], s[4:5], v2, v19, v[1:2] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v18, v[25:26] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[31:32], s[4:5], v11, v26, v[8:9] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v2, v18, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v2, v19, v[9:10] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v8, v0, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[25:26], s[4:5], v8, v31, v[2:3] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v3, v18, v[10:11] ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v12, v28, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v1, v0, v[9:10] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v4, v20, 0 -; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, v3 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v12, v29, v[0:1] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v9, v2, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v13, v28, v[18:19] -; GFX8-GISEL-NEXT: v_mov_b32_e32 v3, v10 -; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v12 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v4, v21, v[3:4] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v8, v0, v[25:26] +; GFX8-GISEL-NEXT: buffer_load_dword v0, off, s[0:3], s32 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v12, v29, v[3:4] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v4, v20, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v13, v28, v[10:11] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v8, v2, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[25:26], s[4:5], v4, v21, v[9:10] ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], 
s[4:5], v14, v30, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v9, v0, v[1:2] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v20, v[12:13] -; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v4 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v22, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v0, v2, v[18:19] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v4, v3, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[27:28], s[4:5], v8, v18, v[11:12] ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v14, v27, v[1:2] -; GFX8-GISEL-NEXT: v_mov_b32_e32 v2, v5 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v15, v30, v[9:10] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v6, v23, v[2:3] -; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v13 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v0, v[1:2] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v7, v22, v[9:10] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v24, v11, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v0, v3, v[5:6] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v8, v12, 0 -; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, v10 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v24, v20, v[0:1] -; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, v4 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v8, v1, v[0:1] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v9, v3, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v17, v12, v[13:14] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v16, v11, v[5:6] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v9, v7, v[1:2] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v12, v3, v[4:5] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v14, v0, v[4:5] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v15, v30, v[8:9] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v6, v22, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v23, v[9:10] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v8, v3, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v8, v13, v[12:13] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v5, v20, v[25:26] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v7, v22, v[14:15] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v8, v2, v[27:28] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v4, v3, v[18:19] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v1, v11, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v24, v10, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v1, v7, v[4:5] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v24, v12, v[6:7] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v17, v11, v[8:9] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v3, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v16, v10, v[13:14] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v5, v6, v[1:2] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v7, v3, v[8:9] ; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-SDAG-LABEL: test_vector_reduce_mul_v16i64: @@ -3956,66 +3902,53 @@ define i64 @test_vector_reduce_mul_v16i64(<16 x i64> %v) { ; GFX10-GISEL-LABEL: test_vector_reduce_mul_v16i64: ; GFX10-GISEL: ; %bb.0: ; %entry ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[31:32], s4, v0, v16, 0 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[33:34], s4, v2, v18, 0 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[38:39], s4, v6, v22, 0 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[35:36], s4, v0, v17, v[32:33] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[36:37], s4, v4, v20, 0 -; GFX10-GISEL-NEXT: 
v_mov_b32_e32 v0, v34 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[16:17], s4, v1, v16, v[35:36] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[0:1], s4, v2, v19, v[0:1] -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, v37 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, v39 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[34:35], s4, v8, v24, 0 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[48:49], s4, v4, v21, v[1:2] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[49:50], s4, v10, v26, 0 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[1:2], s4, v6, v23, v[2:3] -; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, v35 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[5:6], s4, v5, v20, v[48:49] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[51:52], s4, v8, v25, v[2:3] -; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, v50 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[52:53], s4, v10, v27, v[2:3] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[53:54], s4, v12, v28, 0 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[9:10], s4, v9, v24, v[51:52] -; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, v54 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[10:11], s4, v11, v26, v[52:53] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[19:20], s4, v36, v53, 0 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[54:55], s4, v12, v29, v[2:3] -; GFX10-GISEL-NEXT: buffer_load_dword v12, off, s[0:3], s32 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[2:3], s4, v3, v18, v[0:1] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[3:4], s4, v14, v30, 0 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[17:18], s4, v33, v49, 0 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[6:7], s4, v7, v22, v[1:2] -; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, v4 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[7:8], s4, v31, v34, 0 +; GFX10-GISEL-NEXT: buffer_load_dword v35, off, s[0:3], s32 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[31:32], s4, v14, v30, 0 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[33:34], s4, v6, v22, 0 ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[0:1], s4, v14, v12, v[0:1] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[11:12], s4, v38, v3, 0 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, v8 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[13:14], s4, v13, v28, v[54:55] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[14:15], s4, v15, v30, v[0:1] -; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, v18 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[8:9], s4, v31, v9, v[1:2] -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, v12 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[9:10], s4, v33, v10, v[0:1] -; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, v20 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[14:15], s4, v38, v14, v[1:2] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[20:21], s4, v17, v11, 0 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[0:1], s4, v36, v13, v[0:1] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[12:13], s4, v7, v19, 0 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[3:4], s4, v6, v3, v[14:15] -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, v21 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[9:10], s4, v2, v49, v[9:10] -; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, v13 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[4:5], s4, v5, v53, v[0:1] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[5:6], s4, v17, v3, v[1:2] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[0:1], s4, v12, v20, 0 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[2:3], s4, v7, v4, v[2:3] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[13:14], s4, v16, v34, v[8:9] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[3:4], s4, v9, v11, v[5:6] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[4:5], s4, v13, v19, v[2:3] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[1:2], s4, v12, v3, v[1:2] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[1:2], s4, v4, v20, v[1:2] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[36:37], s4, v14, v35, v[32:33] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[37:38], s4, v6, v23, v[34:35] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[34:35], s4, v33, v31, 0 +; 
GFX10-GISEL-NEXT: v_mad_u64_u32 v[38:39], s4, v15, v30, v[36:37] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[14:15], s4, v0, v16, 0 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[36:37], s4, v7, v22, v[37:38] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[6:7], s4, v2, v18, 0 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[22:23], s4, v4, v20, 0 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[32:33], s4, v33, v38, v[35:36] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[37:38], s4, v0, v17, v[15:16] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[38:39], s4, v2, v19, v[7:8] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[35:36], s4, v36, v31, v[32:33] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[30:31], s4, v8, v24, 0 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[32:33], s4, v10, v26, 0 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[15:16], s4, v1, v16, v[37:38] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[16:17], s4, v3, v18, v[38:39] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[0:1], s4, v12, v28, 0 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[2:3], s4, v4, v21, v[23:24] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[3:4], s4, v8, v25, v[31:32] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[7:8], s4, v10, v27, v[33:34] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[17:18], s4, v12, v29, v[1:2] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[18:19], s4, v5, v20, v[2:3] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[19:20], s4, v9, v24, v[3:4] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[9:10], s4, v11, v26, v[7:8] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[3:4], s4, v6, v32, 0 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[10:11], s4, v13, v28, v[17:18] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[7:8], s4, v22, v0, 0 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[1:2], s4, v14, v30, 0 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[11:12], s4, v6, v9, v[4:5] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[12:13], s4, v22, v10, v[8:9] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[8:9], s4, v3, v34, 0 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[13:14], s4, v14, v19, v[2:3] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[4:5], s4, v1, v7, 0 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[10:11], s4, v16, v32, v[11:12] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[11:12], s4, v18, v0, v[12:13] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[12:13], s4, v15, v30, v[13:14] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[2:3], s4, v3, v35, v[9:10] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[5:6], s4, v1, v11, v[5:6] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[0:1], s4, v4, v8, 0 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[2:3], s4, v10, v34, v[2:3] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[5:6], s4, v12, v7, v[5:6] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[1:2], s4, v4, v2, v[1:2] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[1:2], s4, v5, v8, v[1:2] ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-LABEL: test_vector_reduce_mul_v16i64: @@ -4096,66 +4029,62 @@ define i64 @test_vector_reduce_mul_v16i64(<16 x i64> %v) { ; GFX11-GISEL-LABEL: test_vector_reduce_mul_v16i64: ; GFX11-GISEL: ; %bb.0: ; %entry ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: scratch_load_b32 v71, off, s32 -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[31:32], null, v0, v16, 0 +; GFX11-GISEL-NEXT: scratch_load_b32 v55, off, s32 ; GFX11-GISEL-NEXT: v_mad_u64_u32 v[33:34], null, v2, v18, 0 ; GFX11-GISEL-NEXT: v_mad_u64_u32 v[35:36], null, v4, v20, 0 +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[31:32], null, v0, v16, 0 ; GFX11-GISEL-NEXT: v_mad_u64_u32 v[37:38], null, v6, v22, 0 ; GFX11-GISEL-NEXT: v_mad_u64_u32 v[50:51], null, v10, v26, 0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[82:83], null, v2, v19, v[34:35] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[70:71], null, v0, 
v17, v[32:33] +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[83:84], null, v4, v21, v[36:37] ; GFX11-GISEL-NEXT: v_mad_u64_u32 v[52:53], null, v12, v28, 0 +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[64:65], null, v14, v30, 0 +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[66:67], null, v33, v50, 0 ; GFX11-GISEL-NEXT: v_mad_u64_u32 v[48:49], null, v8, v24, 0 -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[54:55], null, v14, v30, 0 -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[82:83], null, v0, v17, v[32:33] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[83:84], null, v2, v19, v[34:35] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[84:85], null, v4, v21, v[36:37] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[85:86], null, v6, v23, v[38:39] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[86:87], null, v10, v27, v[51:52] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[65:66], null, v31, v48, 0 +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[84:85], null, v6, v23, v[38:39] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[96:97], null, v1, v16, v[70:71] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[0:1], null, v3, v18, v[82:83] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[85:86], null, v10, v27, v[51:52] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[86:87], null, v12, v29, v[53:54] ; GFX11-GISEL-NEXT: v_mad_u64_u32 v[38:39], null, v8, v25, v[49:50] -; GFX11-GISEL-NEXT: v_mov_b32_e32 v64, v55 -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[96:97], null, v12, v29, v[53:54] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[97:98], null, v1, v16, v[82:83] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[1:2], null, v3, v18, v[83:84] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[2:3], null, v5, v20, v[84:85] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[3:4], null, v7, v22, v[85:86] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[67:68], null, v33, v50, 0 -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[80:81], null, v37, v54, 0 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, v66 -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[69:70], null, v35, v52, 0 +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[80:81], null, v37, v64, 0 +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[16:17], null, v5, v20, v[83:84] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[68:69], null, v35, v52, 0 +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[17:18], null, v7, v22, v[84:85] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[6:7], null, v9, v24, v[38:39] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[7:8], null, v13, v28, v[86:87] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[53:54], null, v31, v48, 0 ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[4:5], null, v14, v71, v[64:65] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[5:6], null, v9, v24, v[38:39] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[6:7], null, v11, v26, v[86:87] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[7:8], null, v13, v28, v[96:97] -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[8:9], null, v15, v30, v[4:5] -; GFX11-GISEL-NEXT: v_mov_b32_e32 v4, v68 -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[9:10], null, v31, v5, v[0:1] -; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, v81 -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[10:11], null, v33, v6, v[4:5] -; GFX11-GISEL-NEXT: v_mov_b32_e32 v4, v70 +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[1:2], null, v14, v55, v[65:66] +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[4:5], null, v15, v30, v[1:2] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[1:2], null, v11, v26, v[85:86] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[2:3], null, v53, v68, 0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-GISEL-NEXT: 
v_mad_u64_u32 v[8:9], null, v37, v4, v[81:82]
+; GFX11-GISEL-NEXT:    v_mad_u64_u32 v[9:10], null, v33, v1, v[67:68]
+; GFX11-GISEL-NEXT:    v_mad_u64_u32 v[10:11], null, v35, v7, v[69:70]
+; GFX11-GISEL-NEXT:    v_mad_u64_u32 v[4:5], null, v66, v80, 0
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-GISEL-NEXT:    v_mad_u64_u32 v[11:12], null, v17, v64, v[8:9]
+; GFX11-GISEL-NEXT:    v_mad_u64_u32 v[7:8], null, v31, v6, v[54:55]
+; GFX11-GISEL-NEXT:    v_mad_u64_u32 v[12:13], null, v0, v50, v[9:10]
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-GISEL-NEXT:    v_mad_u64_u32 v[5:6], null, v37, v8, v[0:1]
-; GFX11-GISEL-NEXT:    v_mad_u64_u32 v[11:12], null, v67, v80, 0
-; GFX11-GISEL-NEXT:    v_mad_u64_u32 v[13:14], null, v35, v7, v[4:5]
-; GFX11-GISEL-NEXT:    v_mad_u64_u32 v[6:7], null, v65, v69, 0
+; GFX11-GISEL-NEXT:    v_mad_u64_u32 v[8:9], null, v66, v11, v[5:6]
+; GFX11-GISEL-NEXT:    v_mad_u64_u32 v[0:1], null, v16, v52, v[10:11]
+; GFX11-GISEL-NEXT:    v_mad_u64_u32 v[5:6], null, v96, v48, v[7:8]
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-GISEL-NEXT:    v_mad_u64_u32 v[6:7], null, v53, v0, v[3:4]
+; GFX11-GISEL-NEXT:    v_mad_u64_u32 v[9:10], null, v12, v80, v[8:9]
+; GFX11-GISEL-NEXT:    v_mad_u64_u32 v[0:1], null, v2, v4, 0
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, v12
-; GFX11-GISEL-NEXT:    v_mad_u64_u32 v[14:15], null, v3, v54, v[5:6]
-; GFX11-GISEL-NEXT:    v_mad_u64_u32 v[3:4], null, v1, v50, v[10:11]
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_4)
-; GFX11-GISEL-NEXT:    v_mad_u64_u32 v[4:5], null, v2, v52, v[13:14]
-; GFX11-GISEL-NEXT:    v_mov_b32_e32 v2, v7
-; GFX11-GISEL-NEXT:    v_mad_u64_u32 v[7:8], null, v67, v14, v[0:1]
-; GFX11-GISEL-NEXT:    v_mad_u64_u32 v[12:13], null, v97, v48, v[9:10]
-; GFX11-GISEL-NEXT:    v_mad_u64_u32 v[0:1], null, v6, v11, 0
-; GFX11-GISEL-NEXT:    v_mad_u64_u32 v[8:9], null, v65, v4, v[2:3]
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-GISEL-NEXT:    v_mad_u64_u32 v[4:5], null, v3, v80, v[7:8]
-; GFX11-GISEL-NEXT:    v_mad_u64_u32 v[9:10], null, v12, v69, v[8:9]
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_mad_u64_u32 v[7:8], null, v6, v4, v[1:2]
-; GFX11-GISEL-NEXT:    v_mad_u64_u32 v[1:2], null, v9, v11, v[7:8]
+; GFX11-GISEL-NEXT:    v_mad_u64_u32 v[7:8], null, v5, v68, v[6:7]
+; GFX11-GISEL-NEXT:    v_mad_u64_u32 v[5:6], null, v2, v9, v[1:2]
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_mad_u64_u32 v[1:2], null, v7, v4, v[5:6]
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-SDAG-LABEL: test_vector_reduce_mul_v16i64:

From 5c3323a59fd2215f5465b28825cec9f6c417f029 Mon Sep 17 00:00:00 2001
From: Nikolas Klauser
Date: Tue, 11 Nov 2025 22:55:24 +0100
Subject: [PATCH 52/64] [libc++] Remove <stdbool.h> (#164595)

`<stdbool.h>` is provided by the compiler and both Clang and GCC provide
C++-aware versions of these headers, making our own wrapper header
entirely unnecessary.
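As an illustrative aside (not part of the patch): the only guarantee the
commit relies on is that the compiler-provided <stdbool.h> is usable from
C++, where bool/true/false are already keywords and the header only needs to
define __bool_true_false_are_defined. A minimal sanity check, assuming a
hypothetical test file name:

    // check_stdbool.cpp -- hypothetical snippet, not part of this commit.
    // In C++, bool/true/false are keywords; the compiler-provided
    // <stdbool.h> is expected to define only this macro.
    #include <stdbool.h>

    static_assert(__bool_true_false_are_defined == 1,
                  "provided by the compiler's <stdbool.h>");

    int main() {
      bool ok = true; // keyword, not a macro, in C++
      return ok ? 0 : 1;
    }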
---
 libcxx/include/CMakeLists.txt      |  1 -
 libcxx/include/module.modulemap.in |  4 ---
 libcxx/include/stdbool.h           | 44 ------------------------------
 3 files changed, 49 deletions(-)
 delete mode 100644 libcxx/include/stdbool.h

diff --git a/libcxx/include/CMakeLists.txt b/libcxx/include/CMakeLists.txt
index 6d3036dfedddf..131ba99357d62 100644
--- a/libcxx/include/CMakeLists.txt
+++ b/libcxx/include/CMakeLists.txt
@@ -1066,7 +1066,6 @@ set(files
   sstream
   stack
   stdatomic.h
-  stdbool.h
   stddef.h
   stdexcept
   stdio.h
diff --git a/libcxx/include/module.modulemap.in b/libcxx/include/module.modulemap.in
index 27f60e0c0a055..7ca57f6455dd8 100644
--- a/libcxx/include/module.modulemap.in
+++ b/libcxx/include/module.modulemap.in
@@ -2437,10 +2437,6 @@ module std_stdatomic_h [system] {
   header "stdatomic.h"
   export *
 }
-module std_stdbool_h [system] {
-  // <stdbool.h>'s __bool_true_false_are_defined macro requires textual inclusion.
-  textual header "stdbool.h"
-}
 module std_stddef_h [system] {
   // <stddef.h> supports being included multiple times with different pre-defined macros
   textual header "stddef.h"
diff --git a/libcxx/include/stdbool.h b/libcxx/include/stdbool.h
deleted file mode 100644
index 768d08247256a..0000000000000
--- a/libcxx/include/stdbool.h
+++ /dev/null
@@ -1,44 +0,0 @@
-// -*- C++ -*-
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef _LIBCPP_STDBOOL_H
-#define _LIBCPP_STDBOOL_H
-
-/*
-    stdbool.h synopsis
-
-Macros:
-
-    __bool_true_false_are_defined
-
-*/
-
-#if defined(__cplusplus) && __cplusplus < 201103L && defined(_LIBCPP_USE_FROZEN_CXX03_HEADERS)
-# include <__cxx03/__config>
-#else
-# include <__config>
-#endif
-
-#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
-# pragma GCC system_header
-#endif
-
-#if __has_include_next(<stdbool.h>)
-# include_next <stdbool.h>
-#endif
-
-#ifdef __cplusplus
-# undef bool
-# undef true
-# undef false
-# undef __bool_true_false_are_defined
-# define __bool_true_false_are_defined 1
-#endif
-
-#endif // _LIBCPP_STDBOOL_H

From 47a3ea43f522ba6d9f7732b71de5df8bd8c1db48 Mon Sep 17 00:00:00 2001
From: Craig Topper
Date: Tue, 11 Nov 2025 13:57:27 -0800
Subject: [PATCH 53/64] [SPIRV] Use MCRegister instead of unsigned. NFC
 (#167585)

---
 llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVInstPrinter.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVInstPrinter.h b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVInstPrinter.h
index 0c9c3bc51f433..8f2ad48efa9d7 100644
--- a/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVInstPrinter.h
+++ b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVInstPrinter.h
@@ -16,11 +16,12 @@
 #include "MCTargetDesc/SPIRVBaseInfo.h"
 #include "llvm/ADT/DenseSet.h"
 #include "llvm/MC/MCInstPrinter.h"
+#include "llvm/MC/MCRegister.h"

 namespace llvm {
 class SPIRVInstPrinter : public MCInstPrinter {
 private:
-  SmallDenseMap<unsigned, unsigned> ExtInstSetIDs;
+  SmallDenseMap<MCRegister, unsigned> ExtInstSetIDs;
   void recordOpExtInstImport(const MCInst *MI);

 public:

From 519cf3c2b8f25768916d97650f148a66db0bba6f Mon Sep 17 00:00:00 2001
From: Florian Hahn
Date: Tue, 11 Nov 2025 22:07:47 +0000
Subject: [PATCH 54/64] [VPlan] Remove unneeded getDefiningRecipe with
 isa/cast/dyn_cast. (NFC)

Classof for most recipes directly supports VPValue, so there is no need
to call getDefiningRecipe when using isa/cast/dyn_cast.
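To illustrate the pattern the message refers to, here is a minimal,
self-contained sketch of LLVM-style RTTI — the VPValueLike/WidenLike names
are hypothetical stand-ins, not VPlan's real class layout: when a recipe's
classof accepts the value base class, isa/dyn_cast can be applied to the
value directly instead of hopping through its defining recipe.

    // classof_sketch.cpp -- illustrative only, assumed names.
    #include "llvm/Support/Casting.h"
    using namespace llvm;

    struct VPValueLike {
      enum Kind { VK_Plain, VK_Widen };
      const Kind K;
      explicit VPValueLike(Kind K) : K(K) {}
    };

    struct WidenLike : VPValueLike {
      WidenLike() : VPValueLike(VK_Widen) {}
      // classof takes the value base class, so isa<WidenLike>(SomeVPValueLike*)
      // works directly -- no getDefiningRecipe() detour is needed.
      static bool classof(const VPValueLike *V) { return V->K == VK_Widen; }
    };

    bool isWiden(const VPValueLike *V) { return isa<WidenLike>(V); }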
---
 .../Transforms/Vectorize/LoopVectorize.cpp    |  6 ++---
 llvm/lib/Transforms/Vectorize/VPlan.cpp       |  2 +-
 .../Vectorize/VPlanConstruction.cpp           |  4 ++--
 .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 13 +++++------
 .../Transforms/Vectorize/VPlanTransforms.cpp  | 22 +++++++++----------
 5 files changed, 21 insertions(+), 26 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 14cea1c8fa67f..83dee09f94b99 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -8228,9 +8228,7 @@ VPRecipeBuilder::tryToCreatePartialReduction(VPInstruction *Reduction,
   VPValue *BinOp = Reduction->getOperand(0);
   VPValue *Accumulator = Reduction->getOperand(1);
-  VPRecipeBase *BinOpRecipe = BinOp->getDefiningRecipe();
-  if (isa<VPReductionPHIRecipe>(BinOpRecipe) ||
-      isa<VPPartialReductionRecipe>(BinOpRecipe))
+  if (isa<VPReductionPHIRecipe>(BinOp) || isa<VPPartialReductionRecipe>(BinOp))
     std::swap(BinOp, Accumulator);

   assert(ScaleFactor ==
@@ -8798,7 +8796,7 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
     // with fewer lanes than the VF. So the operands of the select would have
     // different numbers of lanes. Partial reductions mask the input instead.
     if (!PhiR->isInLoop() && CM.foldTailByMasking() &&
-        !isa<VPPartialReductionRecipe>(OrigExitingVPV->getDefiningRecipe())) {
+        !isa<VPPartialReductionRecipe>(OrigExitingVPV)) {
       VPValue *Cond = RecipeBuilder.getBlockInMask(PhiR->getParent());
       std::optional<FastMathFlags> FMFs =
           PhiTy->isFloatingPointTy()
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index 90696ffc3aca7..62dacf912e210 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -1428,7 +1428,7 @@ void VPUser::printOperands(raw_ostream &O, VPSlotTracker &SlotTracker) const {
 void VPSlotTracker::assignName(const VPValue *V) {
   assert(!VPValue2Name.contains(V) && "VPValue already has a name!");
   auto *UV = V->getUnderlyingValue();
-  auto *VPI = dyn_cast_or_null<VPInstruction>(V->getDefiningRecipe());
+  auto *VPI = dyn_cast_or_null<VPInstruction>(V);
   if (!UV && !(VPI && !VPI->getName().empty())) {
     VPValue2Name[V] = (Twine("vp<%") + Twine(NextSlot) + ">").str();
     NextSlot++;
diff --git a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
index 663e31a499b01..92ff0dcf67927 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
@@ -798,8 +798,8 @@ void VPlanTransforms::addMinimumVectorEpilogueIterationCheck(
 bool VPlanTransforms::handleMaxMinNumReductions(VPlan &Plan) {
   auto GetMinMaxCompareValue = [](VPReductionPHIRecipe *RedPhiR) -> VPValue * {
-    auto *MinMaxR = dyn_cast<VPRecipeWithIRFlags>(
-        RedPhiR->getBackedgeValue()->getDefiningRecipe());
+    auto *MinMaxR =
+        dyn_cast_or_null<VPRecipeWithIRFlags>(RedPhiR->getBackedgeValue());
     if (!MinMaxR)
       return nullptr;
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 9a5591feb3d05..beae8051e75dc 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -356,7 +356,7 @@ VPPartialReductionRecipe::computeCost(ElementCount VF,
   // recipe.
   auto HandleWiden = [&](VPWidenRecipe *Widen) {
     if (match(Widen, m_Sub(m_ZeroInt(), m_VPValue(Op)))) {
-      Widen = dyn_cast<VPWidenRecipe>(Op->getDefiningRecipe());
+      Widen = dyn_cast<VPWidenRecipe>(Op);
     }
     Opcode = Widen->getOpcode();
     VPRecipeBase *ExtAR = Widen->getOperand(0)->getDefiningRecipe();
@@ -381,11 +381,10 @@ VPPartialReductionRecipe::computeCost(ElementCount VF,
     InputTypeA = Ctx.Types.inferScalarType(OpR->getOperand(0));
     ExtAType = GetExtendKind(OpR);
   } else if (isa<VPReductionPHIRecipe>(OpR)) {
-    auto RedPhiOp1R = getOperand(1)->getDefiningRecipe();
-    if (isa<VPWidenCastRecipe>(RedPhiOp1R)) {
+    if (auto RedPhiOp1R = dyn_cast_or_null<VPWidenCastRecipe>(getOperand(1))) {
       InputTypeA = Ctx.Types.inferScalarType(RedPhiOp1R->getOperand(0));
       ExtAType = GetExtendKind(RedPhiOp1R);
-    } else if (auto Widen = dyn_cast<VPWidenRecipe>(RedPhiOp1R))
+    } else if (auto Widen = dyn_cast_or_null<VPWidenRecipe>(getOperand(1)))
       HandleWiden(Widen);
   } else if (auto Widen = dyn_cast<VPWidenRecipe>(OpR)) {
     HandleWiden(Widen);
@@ -3195,10 +3194,10 @@ bool VPReplicateRecipe::shouldPack() const {
 static const SCEV *getAddressAccessSCEV(const VPValue *Ptr, ScalarEvolution &SE,
                                         const Loop *L) {
   auto *PtrR = Ptr->getDefiningRecipe();
-  if (!PtrR || !((isa<VPReplicateRecipe>(PtrR) &&
-                  cast<VPReplicateRecipe>(PtrR)->getOpcode() ==
+  if (!PtrR || !((isa<VPReplicateRecipe>(Ptr) &&
+                  cast<VPReplicateRecipe>(Ptr)->getOpcode() ==
                       Instruction::GetElementPtr) ||
-                 isa<VPWidenGEPRecipe>(PtrR) ||
+                 isa<VPWidenGEPRecipe>(Ptr) ||
                  match(Ptr, m_GetElementPtr(m_VPValue(), m_VPValue()))))
     return nullptr;
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index a6557141c47af..bb9eed0e0ddb9 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -3719,11 +3719,9 @@ tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red,
   // Try to match reduce.add(mul(...)).
   if (match(VecOp, m_Mul(m_VPValue(A), m_VPValue(B)))) {
-    auto *RecipeA =
-        dyn_cast_if_present<VPWidenCastRecipe>(A->getDefiningRecipe());
-    auto *RecipeB =
-        dyn_cast_if_present<VPWidenCastRecipe>(B->getDefiningRecipe());
-    auto *Mul = cast<VPWidenRecipe>(VecOp->getDefiningRecipe());
+    auto *RecipeA = dyn_cast_if_present<VPWidenCastRecipe>(A);
+    auto *RecipeB = dyn_cast_if_present<VPWidenCastRecipe>(B);
+    auto *Mul = cast<VPWidenRecipe>(VecOp);

     // Convert reduce.add(mul(ext, const)) to reduce.add(mul(ext, ext(const)))
     ExtendAndReplaceConstantOp(RecipeA, RecipeB, B, Mul);
@@ -3748,10 +3746,10 @@ tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red,
   // Match reduce.add(ext(mul(A, B))).
   if (match(VecOp, m_ZExtOrSExt(m_Mul(m_VPValue(A), m_VPValue(B))))) {
-    auto *Ext = cast<VPWidenCastRecipe>(VecOp->getDefiningRecipe());
-    auto *Mul = cast<VPWidenRecipe>(Ext->getOperand(0)->getDefiningRecipe());
-    auto *Ext0 = dyn_cast_if_present<VPWidenCastRecipe>(A->getDefiningRecipe());
-    auto *Ext1 = dyn_cast_if_present<VPWidenCastRecipe>(B->getDefiningRecipe());
+    auto *Ext = cast<VPWidenCastRecipe>(VecOp);
+    auto *Mul = cast<VPWidenRecipe>(Ext->getOperand(0));
+    auto *Ext0 = dyn_cast_if_present<VPWidenCastRecipe>(A);
+    auto *Ext1 = dyn_cast_if_present<VPWidenCastRecipe>(B);

     // reduce.add(ext(mul(ext, const)))
     //   -> reduce.add(ext(mul(ext, ext(const))))
@@ -4329,12 +4327,12 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
   // Check if all values feeding InterleaveR are matching wide recipes, which
   // operands that can be narrowed.
-  auto *WideMember0 = dyn_cast_or_null<VPWidenRecipe>(
-      InterleaveR->getStoredValues()[0]->getDefiningRecipe());
+  auto *WideMember0 =
+      dyn_cast_or_null<VPWidenRecipe>(InterleaveR->getStoredValues()[0]);
   if (!WideMember0)
     return;
   for (const auto &[I, V] : enumerate(InterleaveR->getStoredValues())) {
-    auto *R = dyn_cast_or_null<VPWidenRecipe>(V->getDefiningRecipe());
+    auto *R = dyn_cast_or_null<VPWidenRecipe>(V);
     if (!R || R->getOpcode() != WideMember0->getOpcode() ||
         R->getNumOperands() > 2)
       return;

From 10679307189fb5b51980acd33ae14d70345c6c75 Mon Sep 17 00:00:00 2001
From: Sergei Barannikov
Date: Wed, 12 Nov 2025 01:08:50 +0300
Subject: [PATCH 55/64] [CodeGen] Use MCRegUnit in more places (NFC) (#167578)

---
 llvm/include/llvm/CodeGen/LiveIntervals.h     | 10 +++----
 .../llvm/CodeGen/MachineRegisterInfo.h        |  2 +-
 .../llvm/CodeGen/ReachingDefAnalysis.h        |  8 +++---
 llvm/include/llvm/CodeGen/ScheduleDAGInstrs.h |  4 +--
 .../include/llvm/CodeGen/TargetRegisterInfo.h |  2 +-
 llvm/include/llvm/MC/MCRegisterInfo.h         |  4 +--
 llvm/lib/CodeGen/EarlyIfConversion.cpp        |  4 +--
 llvm/lib/CodeGen/LiveIntervals.cpp            |  8 +++---
 llvm/lib/CodeGen/LiveRegMatrix.cpp            | 16 ++++++------
 llvm/lib/CodeGen/MachineCopyPropagation.cpp   |  4 +--
 llvm/lib/CodeGen/MachineRegisterInfo.cpp      |  2 +-
 llvm/lib/CodeGen/RDFRegisters.cpp             | 26 +++++++++----------
 llvm/lib/CodeGen/TargetRegisterInfo.cpp       |  2 +-
 llvm/lib/MC/MCRegisterInfo.cpp                |  2 +-
 14 files changed, 47 insertions(+), 47 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/LiveIntervals.h b/llvm/include/llvm/CodeGen/LiveIntervals.h
index c252f9d99f2af..32027766e7093 100644
--- a/llvm/include/llvm/CodeGen/LiveIntervals.h
+++ b/llvm/include/llvm/CodeGen/LiveIntervals.h
@@ -412,7 +412,7 @@ class LiveIntervals {
   /// Return the live range for register unit \p Unit. It will be computed if
   /// it doesn't exist.
-  LiveRange &getRegUnit(unsigned Unit) {
- LLVM_ABI bool isReservedRegUnit(unsigned Unit) const; + LLVM_ABI bool isReservedRegUnit(MCRegUnit Unit) const; /// isAllocatable - Returns true when PhysReg belongs to an allocatable /// register class and it hasn't been reserved. diff --git a/llvm/include/llvm/CodeGen/ReachingDefAnalysis.h b/llvm/include/llvm/CodeGen/ReachingDefAnalysis.h index d987a5cf1c3df..2893e5ce6647e 100644 --- a/llvm/include/llvm/CodeGen/ReachingDefAnalysis.h +++ b/llvm/include/llvm/CodeGen/ReachingDefAnalysis.h @@ -77,23 +77,23 @@ class MBBReachingDefsInfo { AllReachingDefs[MBBNumber].resize(NumRegUnits); } - void append(unsigned MBBNumber, unsigned Unit, int Def) { + void append(unsigned MBBNumber, MCRegUnit Unit, int Def) { AllReachingDefs[MBBNumber][Unit].push_back(Def); } - void prepend(unsigned MBBNumber, unsigned Unit, int Def) { + void prepend(unsigned MBBNumber, MCRegUnit Unit, int Def) { auto &Defs = AllReachingDefs[MBBNumber][Unit]; Defs.insert(Defs.begin(), Def); } - void replaceFront(unsigned MBBNumber, unsigned Unit, int Def) { + void replaceFront(unsigned MBBNumber, MCRegUnit Unit, int Def) { assert(!AllReachingDefs[MBBNumber][Unit].empty()); *AllReachingDefs[MBBNumber][Unit].begin() = Def; } void clear() { AllReachingDefs.clear(); } - ArrayRef defs(unsigned MBBNumber, unsigned Unit) const { + ArrayRef defs(unsigned MBBNumber, MCRegUnit Unit) const { if (AllReachingDefs[MBBNumber].empty()) // Block IDs are not necessarily dense. return ArrayRef(); diff --git a/llvm/include/llvm/CodeGen/ScheduleDAGInstrs.h b/llvm/include/llvm/CodeGen/ScheduleDAGInstrs.h index ab0d7e334df44..059a3444c609c 100644 --- a/llvm/include/llvm/CodeGen/ScheduleDAGInstrs.h +++ b/llvm/include/llvm/CodeGen/ScheduleDAGInstrs.h @@ -77,9 +77,9 @@ namespace llvm { struct PhysRegSUOper { SUnit *SU; int OpIdx; - unsigned RegUnit; + MCRegUnit RegUnit; - PhysRegSUOper(SUnit *su, int op, unsigned R) + PhysRegSUOper(SUnit *su, int op, MCRegUnit R) : SU(su), OpIdx(op), RegUnit(R) {} unsigned getSparseSetIndex() const { return RegUnit; } diff --git a/llvm/include/llvm/CodeGen/TargetRegisterInfo.h b/llvm/include/llvm/CodeGen/TargetRegisterInfo.h index f031353422e40..992d5a50a8bbf 100644 --- a/llvm/include/llvm/CodeGen/TargetRegisterInfo.h +++ b/llvm/include/llvm/CodeGen/TargetRegisterInfo.h @@ -1446,7 +1446,7 @@ LLVM_ABI Printable printReg(Register Reg, /// fp0~st7 - Dual roots. /// /// Usage: OS << printRegUnit(Unit, TRI) << '\n'; -LLVM_ABI Printable printRegUnit(unsigned Unit, const TargetRegisterInfo *TRI); +LLVM_ABI Printable printRegUnit(MCRegUnit Unit, const TargetRegisterInfo *TRI); /// Create Printable object to print virtual registers and physical /// registers on a \ref raw_ostream. diff --git a/llvm/include/llvm/MC/MCRegisterInfo.h b/llvm/include/llvm/MC/MCRegisterInfo.h index f611edd715398..e6dbb38dfee67 100644 --- a/llvm/include/llvm/MC/MCRegisterInfo.h +++ b/llvm/include/llvm/MC/MCRegisterInfo.h @@ -687,7 +687,7 @@ class MCRegUnitMaskIterator { } /// Returns a (RegUnit, LaneMask) pair. 
-  std::pair<unsigned, LaneBitmask> operator*() const {
+  std::pair<MCRegUnit, LaneBitmask> operator*() const {
     return std::make_pair(*RUIter, *MaskListIter);
   }

@@ -719,7 +719,7 @@ class MCRegUnitRootIterator {
 public:
   MCRegUnitRootIterator() = default;

-  MCRegUnitRootIterator(unsigned RegUnit, const MCRegisterInfo *MCRI) {
+  MCRegUnitRootIterator(MCRegUnit RegUnit, const MCRegisterInfo *MCRI) {
     assert(RegUnit < MCRI->getNumRegUnits() && "Invalid register unit");
     Reg0 = MCRI->RegUnitRoots[RegUnit][0];
     Reg1 = MCRI->RegUnitRoots[RegUnit][1];
diff --git a/llvm/lib/CodeGen/EarlyIfConversion.cpp b/llvm/lib/CodeGen/EarlyIfConversion.cpp
index da0987c3b50bb..55caa6e8a8f95 100644
--- a/llvm/lib/CodeGen/EarlyIfConversion.cpp
+++ b/llvm/lib/CodeGen/EarlyIfConversion.cpp
@@ -134,7 +134,7 @@ class SSAIfConv {
   BitVector ClobberedRegUnits;

   // Scratch pad for findInsertionPoint.
-  SparseSet<unsigned> LiveRegUnits;
+  SparseSet<MCRegUnit> LiveRegUnits;

   /// Insertion point in Head for speculatively executed instructions form TBB
   /// and FBB.
@@ -421,7 +421,7 @@ bool SSAIfConv::findInsertionPoint() {
     if (!LiveRegUnits.empty()) {
       LLVM_DEBUG({
         dbgs() << "Would clobber";
-        for (unsigned LRU : LiveRegUnits)
+        for (MCRegUnit LRU : LiveRegUnits)
           dbgs() << ' ' << printRegUnit(LRU, TRI);
         dbgs() << " live before " << *I;
       });
diff --git a/llvm/lib/CodeGen/LiveIntervals.cpp b/llvm/lib/CodeGen/LiveIntervals.cpp
index d2f2c3ef33c9c..27c5addffa4ab 100644
--- a/llvm/lib/CodeGen/LiveIntervals.cpp
+++ b/llvm/lib/CodeGen/LiveIntervals.cpp
@@ -305,7 +305,7 @@ void LiveIntervals::computeRegMasks() {
 /// Compute the live range of a register unit, based on the uses and defs of
 /// aliasing registers. The range should be empty, or contain only dead
 /// phi-defs from ABI blocks.
-void LiveIntervals::computeRegUnitRange(LiveRange &LR, unsigned Unit) {
+void LiveIntervals::computeRegUnitRange(LiveRange &LR, MCRegUnit Unit) {
   assert(LICalc && "LICalc not initialized.");
   LICalc->reset(MF, getSlotIndexes(), DomTree, &getVNInfoAllocator());

@@ -354,7 +354,7 @@ void LiveIntervals::computeLiveInRegUnits() {
   LLVM_DEBUG(dbgs() << "Computing live-in reg-units in ABI blocks.\n");

   // Keep track of the live range sets allocated.
-  SmallVector<unsigned, 8> NewRanges;
+  SmallVector<MCRegUnit, 8> NewRanges;

   // Check all basic blocks for live-ins.
   for (const MachineBasicBlock &MBB : *MF) {
@@ -383,7 +383,7 @@ void LiveIntervals::computeLiveInRegUnits() {
   LLVM_DEBUG(dbgs() << "Created " << NewRanges.size() << " new intervals.\n");

   // Compute the 'normal' part of the ranges.
-  for (unsigned Unit : NewRanges)
+  for (MCRegUnit Unit : NewRanges)
     computeRegUnitRange(*RegUnitRanges[Unit], Unit);
 }

@@ -1042,7 +1042,7 @@ class LiveIntervals::HMEditor {
   // physregs, even those that aren't needed for regalloc, in order to update
   // kill flags. This is wasteful. Eventually, LiveVariables will strip all kill
   // flags, and postRA passes will use a live register utility instead.
- LiveRange *getRegUnitLI(unsigned Unit) { + LiveRange *getRegUnitLI(MCRegUnit Unit) { if (UpdateFlags && !MRI.isReservedRegUnit(Unit)) return &LIS.getRegUnit(Unit); return LIS.getCachedRegUnit(Unit); diff --git a/llvm/lib/CodeGen/LiveRegMatrix.cpp b/llvm/lib/CodeGen/LiveRegMatrix.cpp index cfda262aac82d..e3ee8dc325933 100644 --- a/llvm/lib/CodeGen/LiveRegMatrix.cpp +++ b/llvm/lib/CodeGen/LiveRegMatrix.cpp @@ -89,7 +89,7 @@ static bool foreachUnit(const TargetRegisterInfo *TRI, Callable Func) { if (VRegInterval.hasSubRanges()) { for (MCRegUnitMaskIterator Units(PhysReg, TRI); Units.isValid(); ++Units) { - unsigned Unit = (*Units).first; + MCRegUnit Unit = (*Units).first; LaneBitmask Mask = (*Units).second; for (const LiveInterval::SubRange &S : VRegInterval.subranges()) { if ((S.LaneMask & Mask).any()) { @@ -115,7 +115,7 @@ void LiveRegMatrix::assign(const LiveInterval &VirtReg, MCRegister PhysReg) { VRM->assignVirt2Phys(VirtReg.reg(), PhysReg); foreachUnit( - TRI, VirtReg, PhysReg, [&](unsigned Unit, const LiveRange &Range) { + TRI, VirtReg, PhysReg, [&](MCRegUnit Unit, const LiveRange &Range) { LLVM_DEBUG(dbgs() << ' ' << printRegUnit(Unit, TRI) << ' ' << Range); Matrix[Unit].unify(VirtReg, Range); return false; @@ -132,7 +132,7 @@ void LiveRegMatrix::unassign(const LiveInterval &VirtReg) { VRM->clearVirt(VirtReg.reg()); foreachUnit(TRI, VirtReg, PhysReg, - [&](unsigned Unit, const LiveRange &Range) { + [&](MCRegUnit Unit, const LiveRange &Range) { LLVM_DEBUG(dbgs() << ' ' << printRegUnit(Unit, TRI)); Matrix[Unit].extract(VirtReg, Range); return false; @@ -175,11 +175,11 @@ bool LiveRegMatrix::checkRegUnitInterference(const LiveInterval &VirtReg, return false; CoalescerPair CP(VirtReg.reg(), PhysReg, *TRI); - bool Result = foreachUnit(TRI, VirtReg, PhysReg, [&](unsigned Unit, - const LiveRange &Range) { - const LiveRange &UnitRange = LIS->getRegUnit(Unit); - return Range.overlaps(UnitRange, CP, *LIS->getSlotIndexes()); - }); + bool Result = foreachUnit( + TRI, VirtReg, PhysReg, [&](MCRegUnit Unit, const LiveRange &Range) { + const LiveRange &UnitRange = LIS->getRegUnit(Unit); + return Range.overlaps(UnitRange, CP, *LIS->getSlotIndexes()); + }); return Result; } diff --git a/llvm/lib/CodeGen/MachineCopyPropagation.cpp b/llvm/lib/CodeGen/MachineCopyPropagation.cpp index 187bff78f236f..5ec7c48d7ee64 100644 --- a/llvm/lib/CodeGen/MachineCopyPropagation.cpp +++ b/llvm/lib/CodeGen/MachineCopyPropagation.cpp @@ -137,7 +137,7 @@ class CopyTracker { PreservedRegUnits.resize(TRI.getNumRegUnits()); for (unsigned SafeReg = 0, E = TRI.getNumRegs(); SafeReg < E; ++SafeReg) if (!RegMaskOp.clobbersPhysReg(SafeReg)) - for (auto SafeUnit : TRI.regunits(SafeReg)) + for (MCRegUnit SafeUnit : TRI.regunits(SafeReg)) PreservedRegUnits.set(SafeUnit); return PreservedRegUnits; @@ -995,7 +995,7 @@ void MachineCopyPropagation::ForwardCopyPropagateBlock(MachineBasicBlock &MBB) { // Invalidate all entries in the copy map which are not preserved by // this register mask. 
      bool MIRefedinCopyInfo = false;
-      for (unsigned RegUnit : TRI->regunits(Reg)) {
+      for (MCRegUnit RegUnit : TRI->regunits(Reg)) {
         if (!PreservedRegUnits.test(RegUnit))
           Tracker.clobberRegUnit(RegUnit, *TRI, *TII, UseCopyInstr);
         else {
diff --git a/llvm/lib/CodeGen/MachineRegisterInfo.cpp b/llvm/lib/CodeGen/MachineRegisterInfo.cpp
index ae284f3ae2929..094315b3903ea 100644
--- a/llvm/lib/CodeGen/MachineRegisterInfo.cpp
+++ b/llvm/lib/CodeGen/MachineRegisterInfo.cpp
@@ -665,7 +665,7 @@ void MachineRegisterInfo::setCalleeSavedRegs(ArrayRef<MCPhysReg> CSRs) {
   IsUpdatedCSRsInitialized = true;
 }

-bool MachineRegisterInfo::isReservedRegUnit(unsigned Unit) const {
+bool MachineRegisterInfo::isReservedRegUnit(MCRegUnit Unit) const {
   const TargetRegisterInfo *TRI = getTargetRegisterInfo();
   for (MCRegUnitRootIterator Root(Unit, TRI); Root.isValid(); ++Root) {
     if (all_of(TRI->superregs_inclusive(*Root),
diff --git a/llvm/lib/CodeGen/RDFRegisters.cpp b/llvm/lib/CodeGen/RDFRegisters.cpp
index b8d54cadc07f6..1400699a607ff 100644
--- a/llvm/lib/CodeGen/RDFRegisters.cpp
+++ b/llvm/lib/CodeGen/RDFRegisters.cpp
@@ -58,7 +58,7 @@ PhysicalRegisterInfo::PhysicalRegisterInfo(const TargetRegisterInfo &tri,
       UnitInfos[U].Reg = F;
     } else {
       for (MCRegUnitMaskIterator I(F, &TRI); I.isValid(); ++I) {
-        std::pair<uint32_t, LaneBitmask> P = *I;
+        std::pair<MCRegUnit, LaneBitmask> P = *I;
         UnitInfo &UI = UnitInfos[P.first];
         UI.Reg = F;
         UI.Mask = P.second;
@@ -281,9 +281,9 @@ bool RegisterAggr::hasAliasOf(RegisterRef RR) const {
     return Units.anyCommon(PRI.getMaskUnits(RR.Reg));

   for (MCRegUnitMaskIterator U(RR.Reg, &PRI.getTRI()); U.isValid(); ++U) {
-    std::pair<uint32_t, LaneBitmask> P = *U;
-    if ((P.second & RR.Mask).any())
-      if (Units.test(P.first))
+    auto [Unit, LaneMask] = *U;
+    if ((LaneMask & RR.Mask).any())
+      if (Units.test(Unit))
        return true;
   }
   return false;
@@ -296,9 +296,9 @@ bool RegisterAggr::hasCoverOf(RegisterRef RR) const {
   }

   for (MCRegUnitMaskIterator U(RR.Reg, &PRI.getTRI()); U.isValid(); ++U) {
-    std::pair<uint32_t, LaneBitmask> P = *U;
-    if ((P.second & RR.Mask).any())
-      if (!Units.test(P.first))
+    auto [Unit, LaneMask] = *U;
+    if ((LaneMask & RR.Mask).any())
+      if (!Units.test(Unit))
        return false;
   }
   return true;
@@ -311,9 +311,9 @@ RegisterAggr &RegisterAggr::insert(RegisterRef RR) {
   }

   for (MCRegUnitMaskIterator U(RR.Reg, &PRI.getTRI()); U.isValid(); ++U) {
-    std::pair<uint32_t, LaneBitmask> P = *U;
-    if ((P.second & RR.Mask).any())
-      Units.set(P.first);
+    auto [Unit, LaneMask] = *U;
+    if ((LaneMask & RR.Mask).any())
+      Units.set(Unit);
   }
   return *this;
 }
@@ -384,9 +384,9 @@ RegisterRef RegisterAggr::makeRegRef() const {

   LaneBitmask M;
   for (MCRegUnitMaskIterator I(F, &PRI.getTRI()); I.isValid(); ++I) {
-    std::pair<uint32_t, LaneBitmask> P = *I;
-    if (Units.test(P.first))
-      M |= P.second;
+    auto [Unit, LaneMask] = *I;
+    if (Units.test(Unit))
+      M |= LaneMask;
   }
   return RegisterRef(F, M);
 }
diff --git a/llvm/lib/CodeGen/TargetRegisterInfo.cpp b/llvm/lib/CodeGen/TargetRegisterInfo.cpp
index 971f822fa6c53..a5c81afc57a80 100644
--- a/llvm/lib/CodeGen/TargetRegisterInfo.cpp
+++ b/llvm/lib/CodeGen/TargetRegisterInfo.cpp
@@ -133,7 +133,7 @@ Printable llvm::printReg(Register Reg, const TargetRegisterInfo *TRI,
   });
 }

-Printable llvm::printRegUnit(unsigned Unit, const TargetRegisterInfo *TRI) {
+Printable llvm::printRegUnit(MCRegUnit Unit, const TargetRegisterInfo *TRI) {
   return Printable([Unit, TRI](raw_ostream &OS) {
     // Generic printout when TRI is missing.
if (!TRI) { diff --git a/llvm/lib/MC/MCRegisterInfo.cpp b/llvm/lib/MC/MCRegisterInfo.cpp index ba9ef00f9f0d8..7fd92bf974b95 100644 --- a/llvm/lib/MC/MCRegisterInfo.cpp +++ b/llvm/lib/MC/MCRegisterInfo.cpp @@ -221,7 +221,7 @@ bool MCRegisterInfo::regsOverlap(MCRegister RegA, MCRegister RegB) const { return false; } -bool MCRegisterInfo::isArtificialRegUnit(unsigned Unit) const { +bool MCRegisterInfo::isArtificialRegUnit(MCRegUnit Unit) const { for (MCRegUnitRootIterator Root(Unit, this); Root.isValid(); ++Root) if (isArtificial(*Root)) return true; From 2308d16fdc602502dbd22f162b79dbb62063e8a2 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Tue, 11 Nov 2025 14:22:24 -0800 Subject: [PATCH 56/64] AMDGPU: Regenerate test checks after bbde79278 (#167590) Merge chasing latest versions of bulk test updates --- .../CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll | 2265 ++++++++--------- .../CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll | 203 +- .../CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll | 520 ++-- .../CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll | 442 ++-- .../CodeGen/AMDGPU/amdgcn.bitcast.64bit.ll | 234 +- .../CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll | 212 +- 6 files changed, 1854 insertions(+), 2022 deletions(-) diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll index 9b329b338d090..d3ebd92f0677b 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll @@ -234844,1386 +234844,1235 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a ; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:20 ; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:24 ; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:72 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:64 ; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_mov_b32_e32 v57, v21 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mul_f32_e32 v62, 1.0, v1 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v10 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:68 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:72 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: 
buffer_load_dword v60, off, s[0:3], s32 offset:76 +; SI-NEXT: v_mul_f32_e32 v52, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v13 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v57 -; SI-NEXT: v_mov_b32_e32 v63, v23 -; SI-NEXT: v_mov_b32_e32 v46, v20 -; SI-NEXT: v_mov_b32_e32 v41, v18 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v17 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v25 -; SI-NEXT: v_mul_f32_e32 v47, 1.0, v2 -; SI-NEXT: v_mul_f32_e32 v61, 1.0, v3 -; SI-NEXT: v_mul_f32_e32 v18, 1.0, v4 -; SI-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; SI-NEXT: v_mul_f32_e32 v20, 1.0, v6 -; SI-NEXT: v_mul_f32_e32 v60, 1.0, v7 -; SI-NEXT: v_mul_f32_e32 v7, 1.0, v8 -; SI-NEXT: v_mul_f32_e32 v21, 1.0, v9 -; SI-NEXT: v_mul_f32_e32 v45, 1.0, v11 -; SI-NEXT: v_mul_f32_e32 v56, 1.0, v12 -; SI-NEXT: v_mul_f32_e32 v23, 1.0, v13 -; SI-NEXT: v_mul_f32_e32 v11, 1.0, v14 -; SI-NEXT: v_mul_f32_e32 v59, 1.0, v15 -; SI-NEXT: v_mul_f32_e32 v6, 1.0, v16 -; SI-NEXT: v_mul_f32_e32 v3, 1.0, v17 -; SI-NEXT: v_mul_f32_e32 v41, 1.0, v41 -; SI-NEXT: v_mul_f32_e32 v16, 1.0, v19 -; SI-NEXT: v_mul_f32_e32 v58, 1.0, v46 -; SI-NEXT: v_mul_f32_e32 v17, 1.0, v22 -; SI-NEXT: v_mul_f32_e32 v14, 1.0, v63 -; SI-NEXT: v_mul_f32_e32 v15, 1.0, v24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill -; SI-NEXT: v_mul_f32_e32 v63, 1.0, v26 -; SI-NEXT: v_mul_f32_e32 v10, 1.0, v27 -; SI-NEXT: v_mul_f32_e32 v12, 1.0, v28 -; SI-NEXT: v_mul_f32_e32 v27, 1.0, v29 -; SI-NEXT: v_mul_f32_e32 v28, 1.0, v30 -; SI-NEXT: v_mul_f32_e64 v30, 1.0, s16 -; SI-NEXT: v_mul_f32_e64 v26, 1.0, s20 -; SI-NEXT: v_mul_f32_e64 v29, 1.0, s21 -; SI-NEXT: v_mul_f32_e64 v24, 1.0, s24 -; SI-NEXT: v_mul_f32_e64 v25, 1.0, s25 -; SI-NEXT: v_mul_f32_e64 v19, 1.0, s28 -; SI-NEXT: v_mul_f32_e64 v22, 1.0, s29 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v21 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v25 +; SI-NEXT: v_mov_b32_e32 v50, v27 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v29 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v54, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v58, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v48, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v40, 1.0, v8 +; SI-NEXT: v_mul_f32_e32 v61, 1.0, v9 +; SI-NEXT: v_mul_f32_e32 v9, 1.0, v10 +; SI-NEXT: v_mul_f32_e32 v5, 1.0, v11 +; SI-NEXT: v_mul_f32_e32 v42, 1.0, v12 +; SI-NEXT: v_mul_f32_e32 v46, 1.0, v14 +; SI-NEXT: v_mul_f32_e32 v27, 1.0, v15 +; SI-NEXT: v_mul_f32_e32 v11, 1.0, v16 +; SI-NEXT: v_mul_f32_e32 v63, 1.0, v18 +; SI-NEXT: v_mul_f32_e32 v13, 1.0, v19 +; SI-NEXT: v_mul_f32_e32 v19, 1.0, v20 +; SI-NEXT: v_mul_f32_e32 v56, 1.0, v22 +; SI-NEXT: v_mul_f32_e32 v21, 1.0, v23 +; SI-NEXT: v_mul_f32_e32 v17, 1.0, v24 +; SI-NEXT: v_mul_f32_e32 v23, 1.0, v26 +; SI-NEXT: v_mul_f32_e32 v22, 1.0, v50 +; SI-NEXT: v_mul_f32_e32 v24, 1.0, v28 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill 
+; SI-NEXT: v_mul_f32_e32 v29, 1.0, v30 +; SI-NEXT: v_mul_f32_e64 v28, 1.0, s24 +; SI-NEXT: v_mul_f32_e64 v30, 1.0, s25 +; SI-NEXT: v_mul_f32_e64 v50, 1.0, s27 +; SI-NEXT: v_mul_f32_e64 v7, 1.0, s28 +; SI-NEXT: v_mul_f32_e64 v26, 1.0, s29 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v32 -; SI-NEXT: v_mul_f32_e32 v8, 1.0, v33 +; SI-NEXT: v_mul_f32_e32 v18, 1.0, v32 +; SI-NEXT: v_mul_f32_e32 v20, 1.0, v33 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mul_f32_e32 v9, 1.0, v34 -; SI-NEXT: v_mul_f32_e32 v34, 1.0, v35 -; SI-NEXT: v_mul_f32_e32 v31, 1.0, v36 -; SI-NEXT: v_mul_f32_e32 v4, 1.0, v37 +; SI-NEXT: v_mul_f32_e32 v32, 1.0, v34 +; SI-NEXT: v_mul_f32_e32 v25, 1.0, v35 +; SI-NEXT: v_mul_f32_e32 v14, 1.0, v36 +; SI-NEXT: v_mul_f32_e32 v16, 1.0, v37 ; SI-NEXT: v_mul_f32_e32 v37, 1.0, v38 -; SI-NEXT: v_mul_f32_e32 v36, 1.0, v39 -; SI-NEXT: v_mul_f32_e32 v46, 1.0, v48 +; SI-NEXT: v_mul_f32_e32 v39, 1.0, v39 +; SI-NEXT: v_mul_f32_e32 v10, 1.0, v49 +; SI-NEXT: v_mul_f32_e32 v12, 1.0, v51 +; SI-NEXT: v_mul_f32_e32 v15, 1.0, v53 +; SI-NEXT: v_mul_f32_e32 v51, 1.0, v55 ; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_mul_f32_e32 v49, 1.0, v49 +; SI-NEXT: v_mul_f32_e32 v6, 1.0, v41 ; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_mul_f32_e32 v50, 1.0, v50 +; SI-NEXT: v_mul_f32_e32 v8, 1.0, v43 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_mul_f32_e32 v39, 1.0, v51 +; SI-NEXT: v_mul_f32_e32 v55, 1.0, v44 ; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_mul_f32_e32 v38, 1.0, v52 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_mul_f32_e32 v57, 1.0, v53 -; SI-NEXT: s_waitcnt vmcnt(8) expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v54 +; SI-NEXT: v_mul_f32_e32 v44, 1.0, v45 +; SI-NEXT: s_waitcnt vmcnt(9) expcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v47 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v57 ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_mul_f32_e32 v55, 1.0, v55 +; SI-NEXT: v_mul_f32_e32 v45, 1.0, v59 ; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_mul_f32_e32 v53, 1.0, v40 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_mul_f32_e32 v42, 1.0, v42 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_mul_f32_e32 v43, 1.0, v43 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_mul_f32_e32 v13, 1.0, v44 -; SI-NEXT: v_mul_f32_e64 v32, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v35, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v33, 1.0, s19 -; SI-NEXT: v_mul_f32_e64 v51, 1.0, s22 -; SI-NEXT: v_mul_f32_e64 v48, 1.0, s23 -; SI-NEXT: v_mul_f32_e64 v54, 1.0, s26 -; SI-NEXT: v_mul_f32_e64 v52, 1.0, s27 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], 
s32 offset:596 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v59, 1.0, v60 +; SI-NEXT: v_mul_f32_e64 v34, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v36, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v53, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v38, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v31, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v33, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v60, 1.0, s22 +; SI-NEXT: v_mul_f32_e64 v49, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v35, 1.0, s26 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; SI-NEXT: 
buffer_store_dword v19, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB105_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v32 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: s_mov_b64 s[4:5], 0 -; SI-NEXT: v_mov_b32_e32 v40, v49 -; SI-NEXT: v_mov_b32_e32 v44, v53 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v42 -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v35 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v54 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v34 +; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:212 ; 4-byte Folded 
Spill +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v62 -; SI-NEXT: v_mov_b32_e32 v35, v21 -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v41 -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v38 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v30 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v33 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v54 -; SI-NEXT: v_mov_b32_e32 v54, v5 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v60 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v60 +; SI-NEXT: v_mov_b32_e32 v28, v3 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v3 +; SI-NEXT: v_mov_b32_e32 v3, v9 +; SI-NEXT: s_mov_b64 s[4:5], 0 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v36 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v50 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v52 +; SI-NEXT: v_mov_b32_e32 v50, v48 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v3 +; SI-NEXT: v_mov_b32_e32 v38, v13 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v23 +; SI-NEXT: v_mov_b32_e32 v41, v44 +; SI-NEXT: v_mov_b32_e32 v52, v63 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v47 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v61 -; SI-NEXT: v_mov_b32_e32 v47, v45 -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v36 -; SI-NEXT: v_mov_b32_e32 v60, v42 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v29 -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:516 ; 4-byte 
Folded Spill -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v48 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v49 +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v51 -; SI-NEXT: v_mov_b32_e32 v19, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v48 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v21 -; SI-NEXT: v_mov_b32_e32 v30, v20 +; SI-NEXT: v_mov_b32_e32 v7, v40 +; SI-NEXT: v_mov_b32_e32 v48, v27 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v12 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v51, v56 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v49 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v57 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v45 -; SI-NEXT: v_mov_b32_e32 v45, v57 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v56 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v25 -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v53 +; SI-NEXT: v_mov_b32_e32 v53, v58 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 
offset:264 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v52 -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v11 -; SI-NEXT: v_mov_b32_e32 v52, v7 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v39 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v23 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v40 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v46 +; SI-NEXT: v_mov_b32_e32 v46, v9 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v59 -; SI-NEXT: v_mov_b32_e32 v24, v23 -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v28 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v51 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v35 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_mov_b32_e32 v11, v6 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v42 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v3 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v37 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v13 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v11 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v16 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: 
v_lshrrev_b32_e32 v1, 16, v63 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v13 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v63 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v19 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v58 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v56 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v21 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v31 -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v34 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v26 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v17 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v17 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v55 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v61 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v4 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill -; 
SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v58 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v37 -; SI-NEXT: v_mov_b32_e32 v42, v4 -; SI-NEXT: v_mov_b32_e32 v15, v63 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v29 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v18 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v46 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v32 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v20 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v50 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v10 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v25 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v14 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v38 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v12 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) -; 
SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v27 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v39 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v10 +; SI-NEXT: v_mov_b32_e32 v39, v35 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v32 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v53 -; SI-NEXT: v_mov_b32_e32 v5, v18 -; SI-NEXT: v_mov_b32_e32 v6, v41 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v44 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v55 +; SI-NEXT: v_mov_b32_e32 v5, v23 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v43 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v59 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v2, v38 -; SI-NEXT: v_mov_b32_e32 v38, v31 -; SI-NEXT: s_branch .LBB105_3 -; SI-NEXT: .LBB105_2: -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_mov_b32_e32 v35, v21 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v54, v5 -; SI-NEXT: v_mov_b32_e32 v30, v20 -; SI-NEXT: v_mov_b32_e32 v52, v7 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:164 ; 4-byte 
Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v45 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v35 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v42 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v60 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v21 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v11 +; SI-NEXT: v_mov_b32_e32 v9, v11 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v1 +; SI-NEXT: s_branch .LBB105_3 +; SI-NEXT: .LBB105_2: +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: v_mov_b32_e32 v41, v44 +; SI-NEXT: v_mov_b32_e32 v7, v40 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_mov_b32_e32 v50, v48 +; SI-NEXT: v_mov_b32_e32 v48, v27 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr1 
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded 
Spill +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v39, v35 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v46, v9 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:448 ; 4-byte Folded 
Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v53, v58 +; SI-NEXT: v_mov_b32_e32 v28, v3 +; SI-NEXT: v_mov_b32_e32 v38, v13 +; SI-NEXT: s_mov_b64 s[4:5], -1 +; SI-NEXT: v_mov_b32_e32 v52, v63 +; SI-NEXT: v_mov_b32_e32 v5, v23 ; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; 
implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: buffer_store_dword v2, 
off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v24, v23 -; SI-NEXT: v_mov_b32_e32 v19, v18 -; SI-NEXT: v_mov_b32_e32 v51, v56 -; SI-NEXT: v_mov_b32_e32 v40, v49 -; SI-NEXT: v_mov_b32_e32 v47, v45 -; SI-NEXT: v_mov_b32_e32 v44, v53 -; SI-NEXT: v_mov_b32_e32 v11, v6 -; SI-NEXT: s_mov_b64 s[4:5], -1 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v2, v38 -; SI-NEXT: v_mov_b32_e32 v60, v42 -; SI-NEXT: v_mov_b32_e32 v45, v57 -; SI-NEXT: v_mov_b32_e32 v38, v31 -; SI-NEXT: v_mov_b32_e32 v42, v4 -; SI-NEXT: v_mov_b32_e32 v6, v41 -; SI-NEXT: v_mov_b32_e32 v15, v63 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: .LBB105_3: ; %Flow -; SI-NEXT: v_mov_b32_e32 v18, v3 -; SI-NEXT: v_mov_b32_e32 v57, v9 -; SI-NEXT: v_mov_b32_e32 v23, v1 +; SI-NEXT: v_mov_b32_e32 v13, v37 ; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] -; SI-NEXT: v_mov_b32_e32 v53, v47 -; SI-NEXT: v_mov_b32_e32 v47, v51 +; SI-NEXT: v_mov_b32_e32 v19, v48 +; SI-NEXT: v_mov_b32_e32 v63, v7 +; SI-NEXT: v_mov_b32_e32 v58, v53 +; SI-NEXT: v_mov_b32_e32 v37, v27 +; SI-NEXT: v_mov_b32_e32 v48, v49 ; SI-NEXT: s_cbranch_vccnz .LBB105_5 ; SI-NEXT: ; %bb.4: ; %cmp.true -; SI-NEXT: v_mov_b32_e32 
v4, v52 -; SI-NEXT: v_mov_b32_e32 v52, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v13 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v43 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_mov_b32_e32 v59, v45 -; SI-NEXT: v_mov_b32_e32 v61, v46 -; SI-NEXT: v_mov_b32_e32 v46, v2 -; SI-NEXT: v_mov_b32_e32 v45, v1 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v59 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_mov_b32_e32 v7, v28 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v41 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_mov_b32_e32 v17, v38 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v45 +; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v2 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v55 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v39 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v36 -; SI-NEXT: v_add_f32_e32 v48, 0x40c00000, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v50 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v37 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_mov_b32_e32 v37, v42 -; SI-NEXT: v_mov_b32_e32 v42, v2 -; SI-NEXT: v_mov_b32_e32 v41, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v34 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v28 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v2 +; SI-NEXT: v_add_f32_e32 v37, 0x40c00000, v2 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v15 +; SI-NEXT: v_add_f32_e32 v48, 0x40c00000, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v13 +; SI-NEXT: v_mov_b32_e32 v11, v50 +; SI-NEXT: v_add_f32_e32 v50, 0x40c00000, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v32 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_mov_b32_e32 v31, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v17 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_mov_b32_e32 v15, v44 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v57 -; SI-NEXT: v_add_f32_e32 v62, 0x40c00000, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v27 -; SI-NEXT: v_add_f32_e32 v57, 0x40c00000, v1 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v32 -; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v26 -; SI-NEXT: v_mov_b32_e32 v10, v11 -; SI-NEXT: v_mov_b32_e32 v11, v24 -; SI-NEXT: v_add_f32_e32 v43, 0x40c00000, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v18 -; SI-NEXT: v_add_f32_e32 v55, 0x40c00000, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v11 -; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v35 -; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v1 -; 
SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v54 -; SI-NEXT: v_mov_b32_e32 v12, v19 -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v59 -; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; SI-NEXT: v_mov_b32_e32 v59, v58 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v31 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v2 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; SI-NEXT: v_add_f32_e32 v35, 0x40c00000, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v25 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v4 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v29 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v9 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; SI-NEXT: v_add_f32_e32 v50, 0x40c00000, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v4 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v21 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v56 +; SI-NEXT: v_add_f32_e32 v53, 0x40c00000, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v60 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v32, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v42 +; 
SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v52 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v40, 0x40c00000, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v35 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v59, 0x40c00000, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v56, 0x40c00000, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v58 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v46 +; SI-NEXT: v_add_f32_e32 v46, 0x40c00000, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; SI-NEXT: v_add_f32_e32 v35, 0x40c00000, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v39 +; SI-NEXT: v_add_f32_e32 v61, 0x40c00000, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v7 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v3 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; SI-NEXT: v_add_f32_e32 v33, 0x40c00000, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v19 -; SI-NEXT: v_lshr_b64 v[1:2], v[33:34], 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v2 -; SI-NEXT: v_add_f32_e32 v33, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v24 -; SI-NEXT: v_lshr_b64 v[1:2], v[33:34], 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt 
expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v3 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v2 -; SI-NEXT: v_add_f32_e32 v33, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v13 -; SI-NEXT: v_lshr_b64 v[1:2], v[33:34], 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_add_f32_e32 v34, 0x40c00000, v2 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v2 -; SI-NEXT: v_add_f32_e32 v33, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v14 -; SI-NEXT: v_lshr_b64 v[1:2], v[33:34], 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v12 -; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v11 -; SI-NEXT: v_and_b32_e32 v54, 0xffff0000, v14 -; SI-NEXT: v_mov_b32_e32 v14, v26 -; SI-NEXT: v_lshr_b64 v[25:26], v[25:26], 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v34 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v3 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; SI-NEXT: v_add_f32_e32 v33, 0x40c00000, v1 -; SI-NEXT: v_lshr_b64 v[1:2], v[33:34], 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v4 -; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v12 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v40 -; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; SI-NEXT: v_and_b32_e32 v39, 0xffff0000, v4 +; SI-NEXT: v_add_f32_e32 v43, 0x40c00000, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt 
vmcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; SI-NEXT: v_add_f32_e32 v33, 0x40c00000, v1 -; SI-NEXT: v_lshr_b64 v[1:2], v[33:34], 16 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v47 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v53 -; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v2 -; SI-NEXT: v_add_f32_e32 v33, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v9 -; SI-NEXT: v_lshr_b64 v[1:2], v[33:34], 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v10 -; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v10 -; SI-NEXT: v_mov_b32_e32 v47, v46 -; SI-NEXT: v_mov_b32_e32 v46, v45 -; SI-NEXT: v_and_b32_e32 v45, 0xffff0000, v12 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; SI-NEXT: v_add_f32_e32 v33, 0x40c00000, v1 -; SI-NEXT: v_lshr_b64 v[1:2], v[33:34], 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v31 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v2 -; SI-NEXT: v_add_f32_e32 v33, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v7 -; SI-NEXT: v_lshr_b64 v[1:2], v[33:34], 16 +; SI-NEXT: v_add_f32_e32 v52, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v52 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: 
buffer_load_dword v2, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v2 -; SI-NEXT: v_add_f32_e32 v33, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v8 -; SI-NEXT: v_lshr_b64 v[1:2], v[33:34], 16 +; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v26 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v2 -; SI-NEXT: v_add_f32_e32 v33, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v5 -; SI-NEXT: v_lshr_b64 v[1:2], v[33:34], 16 +; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v24 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v63 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v11 +; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v22 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v2 -; SI-NEXT: v_add_f32_e32 v33, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v3 -; SI-NEXT: v_lshr_b64 v[1:2], v[33:34], 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v20 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword 
v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v37 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v38 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_add_f32_e32 v33, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v2 -; SI-NEXT: v_lshr_b64 v[33:34], v[33:34], 16 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v61 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_add_f32_e32 v33, 0x40c00000, v1 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v19 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v18 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v4 -; SI-NEXT: v_lshr_b64 v[33:34], v[33:34], 16 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v52 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v17 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v16 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_add_f32_e32 v33, 0x40c00000, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v6 -; SI-NEXT: v_lshr_b64 v[33:34], v[33:34], 16 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v14 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v60 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v15 -; SI-NEXT: v_add_f32_e32 v37, 0x40c00000, v33 -; SI-NEXT: v_add_f32_e32 v33, 0x40c00000, v1 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v37 -; SI-NEXT: v_lshr_b64 v[33:34], v[33:34], 16 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; 
SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[33:34], v[46:47], 16 -; SI-NEXT: v_mov_b32_e32 v61, v42 -; SI-NEXT: v_mov_b32_e32 v1, v33 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v12 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v53, v33 -; SI-NEXT: v_lshr_b64 v[33:34], v[48:49], 16 -; SI-NEXT: v_mov_b32_e32 v60, v41 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v1, v33 -; SI-NEXT: v_mov_b32_e32 v38, v33 -; SI-NEXT: v_lshr_b64 v[33:34], v[22:23], 16 -; SI-NEXT: v_and_b32_e32 v52, 0xffff0000, v37 -; SI-NEXT: v_mov_b32_e32 v40, v33 -; SI-NEXT: v_and_b32_e32 v37, 0xffff0000, v6 -; SI-NEXT: v_and_b32_e32 v41, 0xffff0000, v8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v1, v33 -; SI-NEXT: v_lshr_b64 v[33:34], v[60:61], 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, v33 -; SI-NEXT: v_lshr_b64 v[33:34], v[62:63], 16 -; SI-NEXT: v_mov_b32_e32 v62, v56 -; SI-NEXT: v_mov_b32_e32 v4, v33 -; SI-NEXT: v_lshr_b64 v[33:34], v[57:58], 16 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v6, v33 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; SI-NEXT: v_lshr_b64 v[33:34], v[29:30], 16 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v8, v33 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v42, v33 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v8, v44 -; SI-NEXT: v_lshr_b64 v[33:34], v[43:44], 16 -; SI-NEXT: v_and_b32_e32 v43, 0xffff0000, v10 -; SI-NEXT: v_mov_b32_e32 v58, v63 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v1, 
0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v10 ; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v8, v33 -; SI-NEXT: v_lshr_b64 v[33:34], v[55:56], 16 -; SI-NEXT: v_mov_b32_e32 v55, v25 -; SI-NEXT: v_mov_b32_e32 v10, v33 -; SI-NEXT: v_mov_b32_e32 v44, v33 -; SI-NEXT: v_lshr_b64 v[33:34], v[27:28], 16 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v10, v33 -; SI-NEXT: v_lshr_b64 v[33:34], v[20:21], 16 -; SI-NEXT: v_and_b32_e32 v56, 0xffff0000, v24 -; SI-NEXT: v_mov_b32_e32 v12, v33 -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v46, v33 -; SI-NEXT: v_lshr_b64 v[33:34], v[31:32], 16 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_mov_b32_e32 v12, v33 -; SI-NEXT: v_lshr_b64 v[33:34], v[35:36], 16 -; SI-NEXT: v_and_b32_e32 v34, 0xffff0000, v19 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill -; SI-NEXT: v_lshr_b64 v[5:6], v[5:6], 16 -; SI-NEXT: v_lshr_b64 v[3:4], v[3:4], 16 -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_lshr_b64 v[7:8], v[7:8], 16 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[7:8], v[41:42], 16 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v7, v23 -; SI-NEXT: v_lshr_b64 v[22:23], v[52:53], 16 -; SI-NEXT: v_mov_b32_e32 v20, v28 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v14, v25 -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v14, v36 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v15, v51 -; SI-NEXT: v_lshr_b64 v[24:25], v[50:51], 16 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; SI-NEXT: v_mov_b32_e32 v57, v24 -; SI-NEXT: v_lshr_b64 v[11:12], v[11:12], 16 -; SI-NEXT: v_lshr_b64 v[9:10], v[9:10], 16 -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:484 ; 4-byte 
Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_lshr_b64 v[11:12], v[45:46], 16 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[9:10], v[43:44], 16 -; SI-NEXT: v_mov_b32_e32 v8, v47 -; SI-NEXT: v_mov_b32_e32 v25, v32 -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v14, v33 -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v15, v24 -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v15, v17 -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_lshr_b64 v[16:17], v[16:17], 16 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v15, v16 -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill -; SI-NEXT: v_lshr_b64 v[13:14], v[13:14], 16 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v14, v49 -; SI-NEXT: v_lshr_b64 v[48:49], v[39:40], 16 -; SI-NEXT: v_mov_b32_e32 v35, v16 -; SI-NEXT: v_lshr_b64 v[15:16], v[34:35], 16 -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[15:16], v[56:57], 16 -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, 
off, s[0:3], s32 offset:520 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[15:16], v[54:55], 16 -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v16, v30 -; SI-NEXT: v_lshr_b64 v[29:30], v[37:38], 16 -; SI-NEXT: .LBB105_5: ; %end -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v8 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(3) -; 
SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v6 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v33 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v4 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload -; SI-NEXT: 
buffer_load_dword v3, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v3 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v2, v28 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v25 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_lshr_b64 v[2:3], v[27:28], 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 +; SI-NEXT: v_mov_b32_e32 v4, v38 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[4:5], v[37:38], 16 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; SI-NEXT: v_mov_b32_e32 v6, v49 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v1, 
0xffff, v1 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_lshr_b64 v[6:7], v[48:49], 16 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v8 +; SI-NEXT: v_mov_b32_e32 v8, v51 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v21 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_lshr_b64 v[8:9], v[50:51], 16 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v10 +; SI-NEXT: v_and_b32_e32 v50, 0xffff0000, v52 +; SI-NEXT: v_lshr_b64 v[51:52], v[61:62], 16 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshr_b64 v[10:11], v[27:28], 16 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v12 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshr_b64 v[12:13], v[37:38], 16 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v14 +; SI-NEXT: v_lshr_b64 v[14:15], v[53:54], 16 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v16 +; SI-NEXT: v_mov_b32_e32 v16, v33 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_lshr_b64 v[16:17], v[32:33], 16 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v18 +; SI-NEXT: v_mov_b32_e32 v18, v41 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[18:19], v[40:41], 16 +; SI-NEXT: v_mov_b32_e32 v39, v59 +; SI-NEXT: v_mov_b32_e32 v40, v60 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v20 +; SI-NEXT: v_lshr_b64 v[20:21], v[39:40], 16 +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v22 +; SI-NEXT: v_lshr_b64 v[22:23], v[56:57], 16 +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v24 +; SI-NEXT: v_lshr_b64 v[24:25], v[46:47], 16 +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v26 +; SI-NEXT: v_lshr_b64 v[26:27], v[35:36], 16 +; SI-NEXT: v_mov_b32_e32 v27, v30 +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v34 +; SI-NEXT: v_lshr_b64 v[33:34], v[29:30], 16 +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v31 +; SI-NEXT: v_mov_b32_e32 v30, v44 +; SI-NEXT: 
buffer_store_dword v29, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v20 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_lshr_b64 v[30:31], v[43:44], 16 +; SI-NEXT: v_lshr_b64 v[43:44], v[32:33], 16 +; SI-NEXT: v_lshr_b64 v[34:35], v[50:51], 16 +; SI-NEXT: v_lshr_b64 v[48:49], v[5:6], 16 +; SI-NEXT: v_lshr_b64 v[37:38], v[3:4], 16 +; SI-NEXT: v_lshr_b64 v[44:45], v[1:2], 16 +; SI-NEXT: v_lshr_b64 v[27:28], v[25:26], 16 +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshr_b64 v[27:28], v[23:24], 16 +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[27:28], v[21:22], 16 +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[27:28], v[19:20], 16 +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[27:28], v[17:18], 16 +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[27:28], v[15:16], 16 +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[27:28], v[13:14], 16 +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[27:28], v[11:12], 16 +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[27:28], v[9:10], 16 +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[31:32], v[29:30], 16 +; 
SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[27:28], v[7:8], 16 +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; SI-NEXT: .LBB105_5: ; %end +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v33 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v43 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v32 +; SI-NEXT: v_or_b32_e32 v3, v3, v5 +; SI-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v28 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v31 +; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v32 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v30 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v62 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v30 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v34 +; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v27 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v51 +; SI-NEXT: 
v_lshlrev_b32_e32 v3, 16, v62 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v27 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v27 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v36 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v25 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v25 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v47 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v23 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:416 ; 4-byte Folded 
Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v23 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v57 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v21 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v21 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v40 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v19 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v19 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:352 ; 
4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v59 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v18 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v17 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v17 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v58 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v16 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, 
v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v15 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v61 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v54 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v13 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v13 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v48 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v12 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v11 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v7 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: 
v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v10 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v9 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v29 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v8 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v48 +; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v14 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x6c, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v37 +; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x70, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:292 
; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v22 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x74, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v44 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x78, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v8 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll index 078ba76eb1f12..22dd3a0438136 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll @@ -18735,68 +18735,68 @@ define inreg <8 x i16> @bitcast_v8bf16_to_v8i16_scalar(<8 x bfloat> inreg %a, i3 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s24, 0 ; SI-NEXT: v_mul_f32_e64 v15, 1.0, s16 -; SI-NEXT: v_mul_f32_e64 v14, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v10, 1.0, s19 -; SI-NEXT: v_mul_f32_e64 v13, 1.0, s20 -; SI-NEXT: v_mul_f32_e64 v12, 1.0, s21 -; SI-NEXT: v_mul_f32_e64 v6, 1.0, s22 -; SI-NEXT: v_mul_f32_e64 v11, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v9, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v3, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v14, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v5, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v7, 1.0, s22 +; SI-NEXT: v_mul_f32_e64 v13, 1.0, s23 ; SI-NEXT: s_cbranch_scc0 .LBB95_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v4, 
16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v7 ; SI-NEXT: s_cbranch_execnz .LBB95_3 ; SI-NEXT: .LBB95_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v14 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v15 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v1 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v11 ; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v14 +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v10 +; SI-NEXT: v_lshr_b64 v[4:5], v[1:2], 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v7 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v13 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v12 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v1 -; SI-NEXT: v_lshr_b64 v[4:5], v[3:4], 16 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 -; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v3 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v11 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v10 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v9 +; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshr_b64 v[8:9], v[6:7], 16 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; SI-NEXT: v_lshr_b64 v[9:10], v[2:3], 16 -; SI-NEXT: v_mov_b32_e32 v6, v8 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v14 -; SI-NEXT: v_mov_b32_e32 v2, v9 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_lshr_b64 v[5:6], v[5:6], 16 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v10 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v1 +; SI-NEXT: v_lshr_b64 v[2:3], v[9:10], 16 +; SI-NEXT: v_lshr_b64 v[6:7], v[7:8], 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v11 +; SI-NEXT: v_lshr_b64 v[11:12], v[1:2], 16 +; SI-NEXT: v_lshr_b64 v[12:13], v[5:6], 16 ; SI-NEXT: .LBB95_3: ; %end -; SI-NEXT: v_mov_b32_e32 v2, v9 -; SI-NEXT: v_mov_b32_e32 v6, v8 +; SI-NEXT: v_mov_b32_e32 v1, v11 +; SI-NEXT: v_mov_b32_e32 v3, v10 +; SI-NEXT: v_mov_b32_e32 v5, v12 +; SI-NEXT: v_mov_b32_e32 v7, v8 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB95_4: ; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: s_branch .LBB95_2 ; ; VI-LABEL: bitcast_v8bf16_to_v8i16_scalar: @@ -24555,7 +24555,7 @@ define inreg <16 x i8> @bitcast_v8bf16_to_v16i8_scalar(<8 x bfloat> inreg %a, i3 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s24, 0 ; SI-NEXT: v_mul_f32_e64 v28, 1.0, s17 -; 
SI-NEXT: v_mul_f32_e64 v23, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v8, 1.0, s16 ; SI-NEXT: v_mul_f32_e64 v27, 1.0, s19 ; SI-NEXT: v_mul_f32_e64 v5, 1.0, s18 ; SI-NEXT: v_mul_f32_e64 v30, 1.0, s21 @@ -24564,87 +24564,84 @@ define inreg <16 x i8> @bitcast_v8bf16_to_v16i8_scalar(<8 x bfloat> inreg %a, i3 ; SI-NEXT: v_mul_f32_e64 v13, 1.0, s22 ; SI-NEXT: s_cbranch_scc0 .LBB109_4 ; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v30 +; SI-NEXT: v_lshr_b64 v[19:20], v[8:9], 16 ; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v27 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v28 -; SI-NEXT: v_lshr_b64 v[19:20], v[5:6], 16 +; SI-NEXT: v_lshr_b64 v[22:23], v[25:26], 16 ; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v29 -; SI-NEXT: v_lshr_b64 v[0:1], v[23:24], 16 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v30 -; SI-NEXT: v_lshr_b64 v[21:22], v[13:14], 16 -; SI-NEXT: v_mov_b32_e32 v1, v19 -; SI-NEXT: v_lshr_b64 v[8:9], v[25:26], 16 -; SI-NEXT: v_mov_b32_e32 v9, v21 -; SI-NEXT: v_lshr_b64 v[16:17], v[0:1], 16 +; SI-NEXT: v_lshr_b64 v[20:21], v[5:6], 16 +; SI-NEXT: v_lshr_b64 v[23:24], v[13:14], 16 +; SI-NEXT: v_lshr_b64 v[0:1], v[19:20], 16 +; SI-NEXT: v_lshr_b64 v[10:11], v[22:23], 16 +; SI-NEXT: v_lshr_b64 v[3:4], v[19:20], 24 ; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v27 ; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v29 -; SI-NEXT: v_lshrrev_b32_e32 v20, 8, v19 -; SI-NEXT: v_lshrrev_b32_e32 v22, 8, v21 -; SI-NEXT: v_lshr_b64 v[3:4], v[0:1], 24 -; SI-NEXT: v_lshr_b64 v[1:2], v[0:1], 8 -; SI-NEXT: v_lshr_b64 v[11:12], v[8:9], 24 -; SI-NEXT: v_lshr_b64 v[17:18], v[8:9], 16 -; SI-NEXT: v_lshr_b64 v[9:10], v[8:9], 8 +; SI-NEXT: v_lshrrev_b32_e32 v9, 8, v20 +; SI-NEXT: v_lshrrev_b32_e32 v16, 8, v23 +; SI-NEXT: v_lshr_b64 v[1:2], v[19:20], 8 +; SI-NEXT: v_lshr_b64 v[17:18], v[22:23], 24 +; SI-NEXT: v_lshr_b64 v[11:12], v[22:23], 8 ; SI-NEXT: s_cbranch_execnz .LBB109_3 ; SI-NEXT: .LBB109_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v13 ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v30 -; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v29 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v25 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v27 -; SI-NEXT: v_lshr_b64 v[8:9], v[0:1], 16 +; SI-NEXT: v_lshr_b64 v[22:23], v[0:1], 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v13 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v29 ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v28 -; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v2 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v23 +; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v8 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v10 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshr_b64 v[19:20], v[5:6], 16 -; SI-NEXT: v_lshr_b64 v[21:22], v[13:14], 16 -; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 -; SI-NEXT: v_mov_b32_e32 v1, v19 -; SI-NEXT: v_mov_b32_e32 v9, v21 -; SI-NEXT: v_lshr_b64 v[16:17], v[0:1], 16 -; SI-NEXT: v_lshr_b64 v[3:4], v[0:1], 24 -; SI-NEXT: v_lshr_b64 v[1:2], v[0:1], 8 -; SI-NEXT: v_lshr_b64 v[11:12], v[8:9], 24 -; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v10 -; 
SI-NEXT: v_lshr_b64 v[17:18], v[8:9], 16 -; SI-NEXT: v_lshr_b64 v[9:10], v[8:9], 8 -; SI-NEXT: v_lshrrev_b32_e32 v20, 8, v19 -; SI-NEXT: v_lshrrev_b32_e32 v22, 8, v21 +; SI-NEXT: v_lshr_b64 v[19:20], v[0:1], 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v27 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v7 +; SI-NEXT: v_lshr_b64 v[20:21], v[5:6], 16 +; SI-NEXT: v_lshr_b64 v[23:24], v[13:14], 16 +; SI-NEXT: v_lshr_b64 v[0:1], v[19:20], 16 +; SI-NEXT: v_lshr_b64 v[10:11], v[22:23], 16 +; SI-NEXT: v_lshr_b64 v[3:4], v[19:20], 24 +; SI-NEXT: v_lshr_b64 v[1:2], v[19:20], 8 +; SI-NEXT: v_lshr_b64 v[17:18], v[22:23], 24 +; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v12 +; SI-NEXT: v_lshr_b64 v[11:12], v[22:23], 8 +; SI-NEXT: v_lshrrev_b32_e32 v9, 8, v20 +; SI-NEXT: v_lshrrev_b32_e32 v16, 8, v23 ; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v7 ; SI-NEXT: .LBB109_3: ; %end -; SI-NEXT: v_mov_b32_e32 v2, v16 -; SI-NEXT: v_mov_b32_e32 v4, v19 -; SI-NEXT: v_mov_b32_e32 v5, v20 -; SI-NEXT: v_mov_b32_e32 v10, v17 -; SI-NEXT: v_mov_b32_e32 v12, v21 -; SI-NEXT: v_mov_b32_e32 v13, v22 +; SI-NEXT: v_mov_b32_e32 v2, v0 +; SI-NEXT: v_mov_b32_e32 v0, v19 +; SI-NEXT: v_mov_b32_e32 v4, v20 +; SI-NEXT: v_mov_b32_e32 v5, v9 +; SI-NEXT: v_mov_b32_e32 v8, v22 +; SI-NEXT: v_mov_b32_e32 v9, v11 +; SI-NEXT: v_mov_b32_e32 v11, v17 +; SI-NEXT: v_mov_b32_e32 v12, v23 +; SI-NEXT: v_mov_b32_e32 v13, v16 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB109_4: -; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: s_branch .LBB109_2 ; ; VI-LABEL: bitcast_v8bf16_to_v16i8_scalar: diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll index ccc6e9c7e9c16..155ec568a65d3 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll @@ -29772,125 +29772,125 @@ define inreg <16 x i16> @bitcast_v16bf16_to_v16i16_scalar(<16 x bfloat> inreg %a ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mul_f32_e64 v31, 1.0, s16 -; SI-NEXT: v_mul_f32_e64 v30, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v18, 1.0, s19 -; SI-NEXT: v_mul_f32_e64 v29, 1.0, s20 -; SI-NEXT: v_mul_f32_e64 v28, 1.0, s21 -; SI-NEXT: v_mul_f32_e64 v6, 1.0, s22 -; SI-NEXT: v_mul_f32_e64 v20, 1.0, s23 -; SI-NEXT: v_mul_f32_e64 v27, 1.0, s24 -; SI-NEXT: v_mul_f32_e64 v26, 1.0, s25 -; SI-NEXT: v_mul_f32_e64 v10, 1.0, s26 -; SI-NEXT: v_mul_f32_e64 v22, 1.0, s27 -; SI-NEXT: v_mul_f32_e64 v25, 1.0, s28 -; SI-NEXT: v_mul_f32_e64 v24, 1.0, s29 -; SI-NEXT: v_mul_f32_e32 v14, 1.0, v0 -; SI-NEXT: v_mul_f32_e32 v23, 
1.0, v1 +; SI-NEXT: v_mul_f32_e64 v32, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v31, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v21, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v3, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v30, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v5, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v19, 1.0, s22 +; SI-NEXT: v_mul_f32_e64 v7, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v29, 1.0, s24 +; SI-NEXT: v_mul_f32_e64 v9, 1.0, s25 +; SI-NEXT: v_mul_f32_e64 v17, 1.0, s26 +; SI-NEXT: v_mul_f32_e64 v11, 1.0, s27 +; SI-NEXT: v_mul_f32_e64 v28, 1.0, s28 +; SI-NEXT: v_mul_f32_e64 v13, 1.0, s29 +; SI-NEXT: v_mul_f32_e32 v15, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v27, 1.0, v1 ; SI-NEXT: s_cbranch_scc0 .LBB95_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v31 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v30 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v29 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v28 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v27 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v25 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v31 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v15 ; SI-NEXT: s_cbranch_execnz .LBB95_3 ; SI-NEXT: .LBB95_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v30 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v31 -; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v31 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v32 +; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v1 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v23 ; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v30 +; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v22 +; SI-NEXT: v_lshr_b64 v[4:5], v[1:2], 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v9 ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v29 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v5 +; SI-NEXT: v_lshr_b64 v[8:9], v[1:2], 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v13 ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v28 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v1 -; SI-NEXT: v_lshr_b64 v[4:5], v[3:4], 16 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v25 -; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v5 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, 
v24 -; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v27 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v5 -; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v3 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v26 -; SI-NEXT: v_lshr_b64 v[12:13], v[11:12], 16 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v5 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v10 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v3 -; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v5 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v22 -; SI-NEXT: v_lshr_b64 v[8:9], v[7:8], 16 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v14 -; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v7 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v23 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v5 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v3 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 -; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v3 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v20 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v18 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v7 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v6 +; SI-NEXT: v_lshr_b64 v[12:13], v[1:2], 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v15 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v27 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshr_b64 v[16:17], v[14:15], 16 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; SI-NEXT: v_lshr_b64 v[21:22], v[10:11], 16 -; SI-NEXT: v_lshr_b64 v[19:20], v[6:7], 16 -; SI-NEXT: v_lshr_b64 v[17:18], v[2:3], 16 -; SI-NEXT: v_mov_b32_e32 v14, v16 -; SI-NEXT: v_mov_b32_e32 v10, v21 -; SI-NEXT: v_mov_b32_e32 v6, v19 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v30 -; SI-NEXT: v_mov_b32_e32 v2, v17 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_lshr_b64 v[5:6], v[5:6], 16 -; SI-NEXT: v_lshr_b64 v[9:10], v[9:10], 16 -; SI-NEXT: v_lshr_b64 v[13:14], v[13:14], 16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v17 +; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v11 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v19 +; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v7 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v21 +; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v1 +; SI-NEXT: v_lshr_b64 v[2:3], v[21:22], 16 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v6 +; SI-NEXT: v_lshr_b64 v[6:7], v[19:20], 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v23 +; SI-NEXT: v_lshr_b64 v[10:11], v[17:18], 16 +; SI-NEXT: v_lshr_b64 v[23:24], v[1:2], 16 +; SI-NEXT: v_lshr_b64 v[14:15], v[15:16], 16 +; SI-NEXT: v_lshr_b64 v[24:25], v[5:6], 16 +; SI-NEXT: v_lshr_b64 v[25:26], v[9:10], 16 +; SI-NEXT: v_lshr_b64 v[26:27], v[13:14], 16 ; SI-NEXT: .LBB95_3: ; %end -; SI-NEXT: v_mov_b32_e32 v2, v17 -; SI-NEXT: v_mov_b32_e32 v6, v19 -; 
SI-NEXT: v_mov_b32_e32 v10, v21 -; SI-NEXT: v_mov_b32_e32 v14, v16 +; SI-NEXT: v_mov_b32_e32 v1, v23 +; SI-NEXT: v_mov_b32_e32 v3, v22 +; SI-NEXT: v_mov_b32_e32 v5, v24 +; SI-NEXT: v_mov_b32_e32 v7, v20 +; SI-NEXT: v_mov_b32_e32 v9, v25 +; SI-NEXT: v_mov_b32_e32 v11, v18 +; SI-NEXT: v_mov_b32_e32 v13, v26 +; SI-NEXT: v_mov_b32_e32 v15, v16 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB95_4: ; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: s_branch .LBB95_2 ; ; VI-LABEL: bitcast_v16bf16_to_v16i16_scalar: @@ -40135,217 +40135,205 @@ define inreg <32 x i8> @bitcast_v16bf16_to_v32i8_scalar(<16 x bfloat> inreg %a, ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 
offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_mul_f32_e64 v56, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v40, 1.0, s16 -; SI-NEXT: v_mul_f32_e64 v55, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v46, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v8, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v45, 1.0, s19 ; SI-NEXT: v_mul_f32_e64 v5, 1.0, s18 ; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_mul_f32_e64 v58, 1.0, s21 -; SI-NEXT: v_mul_f32_e64 v42, 1.0, s20 -; SI-NEXT: v_mul_f32_e64 v57, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v56, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v16, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v47, 1.0, s23 ; SI-NEXT: v_mul_f32_e64 v13, 1.0, s22 ; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_mul_f32_e64 v60, 1.0, s25 -; SI-NEXT: v_mul_f32_e64 v44, 1.0, s24 -; SI-NEXT: v_mul_f32_e64 v59, 1.0, s27 +; SI-NEXT: v_mul_f32_e64 v58, 1.0, s25 +; SI-NEXT: v_mul_f32_e64 v24, 1.0, s24 +; SI-NEXT: v_mul_f32_e64 v57, 1.0, s27 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v62, 1.0, s29 -; SI-NEXT: v_mul_f32_e32 v61, 1.0, v1 +; SI-NEXT: v_mul_f32_e64 v60, 1.0, s29 +; SI-NEXT: v_mul_f32_e32 v59, 1.0, v1 ; SI-NEXT: v_mul_f32_e32 v29, 1.0, v0 ; SI-NEXT: v_mul_f32_e64 v21, 1.0, s26 -; SI-NEXT: v_mul_f32_e64 v46, 1.0, s28 +; SI-NEXT: v_mul_f32_e64 v32, 1.0, s28 ; SI-NEXT: s_cbranch_scc0 .LBB109_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v55 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v57 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v59 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v56 -; SI-NEXT: v_lshr_b64 v[35:36], v[5:6], 16 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v58 -; SI-NEXT: v_lshr_b64 v[38:39], v[13:14], 16 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v60 -; SI-NEXT: v_lshr_b64 v[51:52], v[21:22], 16 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v61 -; SI-NEXT: v_lshr_b64 v[0:1], v[40:41], 16 -; SI-NEXT: v_lshr_b64 v[8:9], v[42:43], 16 -; SI-NEXT: v_lshr_b64 v[16:17], v[44:45], 16 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v62 -; SI-NEXT: v_lshr_b64 v[53:54], v[29:30], 16 -; SI-NEXT: v_mov_b32_e32 v1, v35 -; SI-NEXT: v_mov_b32_e32 v9, v38 -; SI-NEXT: v_mov_b32_e32 v17, v51 -; SI-NEXT: v_lshr_b64 v[24:25], v[46:47], 16 -; SI-NEXT: v_lshr_b64 v[33:34], v[0:1], 16 -; SI-NEXT: v_lshr_b64 v[11:12], v[8:9], 24 -; SI-NEXT: v_lshr_b64 v[36:37], v[8:9], 16 -; SI-NEXT: v_lshr_b64 v[9:10], v[8:9], 8 -; SI-NEXT: v_mov_b32_e32 v25, v53 -; SI-NEXT: v_lshr_b64 v[48:49], v[16:17], 16 -; SI-NEXT: v_lshr_b64 v[3:4], v[0:1], 24 -; SI-NEXT: v_lshr_b64 v[1:2], v[0:1], 8 -; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v55 -; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v57 -; SI-NEXT: v_lshrrev_b32_e32 v10, 8, v35 -; SI-NEXT: v_lshrrev_b32_e32 v23, 24, v59 -; SI-NEXT: v_lshrrev_b32_e32 v34, 8, v38 -; SI-NEXT: v_lshrrev_b32_e32 v31, 24, v61 -; SI-NEXT: v_lshrrev_b32_e32 v32, 8, v51 -; SI-NEXT: v_lshrrev_b32_e32 v37, 8, v53 -; SI-NEXT: v_lshr_b64 v[19:20], v[16:17], 24 -; SI-NEXT: v_lshr_b64 v[17:18], v[16:17], 8 -; SI-NEXT: v_lshr_b64 v[27:28], v[24:25], 24 -; SI-NEXT: v_lshr_b64 v[49:50], v[24:25], 16 -; SI-NEXT: v_lshr_b64 v[25:26], v[24:25], 8 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v56 +; 
SI-NEXT: v_lshrrev_b32_e32 v25, 16, v58 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v46 +; SI-NEXT: v_lshr_b64 v[53:54], v[16:17], 16 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v47 +; SI-NEXT: v_lshr_b64 v[39:40], v[24:25], 16 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v57 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v60 +; SI-NEXT: v_lshr_b64 v[50:51], v[8:9], 16 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v45 +; SI-NEXT: v_lshr_b64 v[54:55], v[13:14], 16 +; SI-NEXT: v_lshr_b64 v[40:41], v[21:22], 16 +; SI-NEXT: v_lshr_b64 v[42:43], v[32:33], 16 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v59 +; SI-NEXT: v_lshr_b64 v[51:52], v[5:6], 16 +; SI-NEXT: v_lshr_b64 v[43:44], v[29:30], 16 +; SI-NEXT: v_lshr_b64 v[36:37], v[53:54], 16 +; SI-NEXT: v_lshr_b64 v[9:10], v[53:54], 8 +; SI-NEXT: v_lshr_b64 v[19:20], v[39:40], 24 +; SI-NEXT: v_lshr_b64 v[34:35], v[39:40], 16 +; SI-NEXT: v_lshr_b64 v[3:4], v[50:51], 24 +; SI-NEXT: v_lshr_b64 v[48:49], v[50:51], 16 +; SI-NEXT: v_lshr_b64 v[1:2], v[50:51], 8 +; SI-NEXT: v_lshr_b64 v[11:12], v[53:54], 24 +; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v45 +; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v47 +; SI-NEXT: v_lshrrev_b32_e32 v10, 8, v51 +; SI-NEXT: v_lshrrev_b32_e32 v23, 24, v57 +; SI-NEXT: v_lshrrev_b32_e32 v20, 8, v54 +; SI-NEXT: v_lshrrev_b32_e32 v31, 24, v59 +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v40 +; SI-NEXT: v_lshrrev_b32_e32 v35, 8, v43 +; SI-NEXT: v_lshr_b64 v[17:18], v[39:40], 8 +; SI-NEXT: v_lshr_b64 v[27:28], v[42:43], 24 +; SI-NEXT: v_lshr_b64 v[37:38], v[42:43], 16 +; SI-NEXT: v_lshr_b64 v[25:26], v[42:43], 8 ; SI-NEXT: s_cbranch_execnz .LBB109_3 ; SI-NEXT: .LBB109_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v62 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v46 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v29 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v61 -; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v2 -; SI-NEXT: v_lshr_b64 v[24:25], v[0:1], 16 ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v60 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v21 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v44 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v32 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v59 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v13 -; SI-NEXT: v_lshr_b64 v[16:17], v[0:1], 16 +; SI-NEXT: v_lshr_b64 v[42:43], v[0:1], 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v29 +; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v59 ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v58 -; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v57 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v42 +; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v24 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v55 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v23 -; SI-NEXT: v_lshr_b64 v[8:9], v[0:1], 16 +; SI-NEXT: v_lshr_b64 v[39:40], v[0:1], 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v21 +; 
SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v57 ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v56 -; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v31 -; SI-NEXT: v_lshr_b64 v[51:52], v[21:22], 16 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v15 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v40 +; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v16 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v7 -; SI-NEXT: v_lshr_b64 v[53:54], v[29:30], 16 -; SI-NEXT: v_lshr_b64 v[38:39], v[13:14], 16 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshr_b64 v[35:36], v[5:6], 16 -; SI-NEXT: v_mov_b32_e32 v17, v51 -; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 -; SI-NEXT: v_mov_b32_e32 v25, v53 -; SI-NEXT: v_mov_b32_e32 v9, v38 -; SI-NEXT: v_mov_b32_e32 v1, v35 -; SI-NEXT: v_lshr_b64 v[48:49], v[16:17], 16 -; SI-NEXT: v_lshr_b64 v[3:4], v[0:1], 24 -; SI-NEXT: v_lshr_b64 v[33:34], v[0:1], 16 -; SI-NEXT: v_lshr_b64 v[1:2], v[0:1], 8 -; SI-NEXT: v_lshr_b64 v[11:12], v[8:9], 24 -; SI-NEXT: v_lshr_b64 v[36:37], v[8:9], 16 -; SI-NEXT: v_lshr_b64 v[9:10], v[8:9], 8 -; SI-NEXT: v_lshr_b64 v[19:20], v[16:17], 24 -; SI-NEXT: v_lshr_b64 v[17:18], v[16:17], 8 -; SI-NEXT: v_lshr_b64 v[27:28], v[24:25], 24 -; SI-NEXT: v_lshr_b64 v[49:50], v[24:25], 16 -; SI-NEXT: v_lshr_b64 v[25:26], v[24:25], 8 -; SI-NEXT: v_lshrrev_b32_e32 v10, 8, v35 -; SI-NEXT: v_lshrrev_b32_e32 v34, 8, v38 -; SI-NEXT: v_lshrrev_b32_e32 v32, 8, v51 -; SI-NEXT: v_lshrrev_b32_e32 v37, 8, v53 -; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v7 +; SI-NEXT: v_lshr_b64 v[53:54], v[0:1], 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v13 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v47 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v46 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v8 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshr_b64 v[50:51], v[0:1], 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v45 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v15 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v31 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; SI-NEXT: v_lshr_b64 v[54:55], v[13:14], 16 +; SI-NEXT: v_lshr_b64 v[43:44], v[29:30], 16 +; SI-NEXT: v_lshr_b64 v[40:41], v[21:22], 16 +; SI-NEXT: v_lshr_b64 v[51:52], v[5:6], 16 +; SI-NEXT: v_lshr_b64 v[36:37], v[53:54], 16 +; SI-NEXT: v_lshr_b64 v[3:4], v[50:51], 24 +; SI-NEXT: v_lshr_b64 v[48:49], v[50:51], 16 +; SI-NEXT: v_lshr_b64 v[1:2], v[50:51], 8 +; SI-NEXT: v_lshr_b64 v[11:12], v[53:54], 24 +; SI-NEXT: v_lshr_b64 v[9:10], v[53:54], 8 +; SI-NEXT: v_lshr_b64 v[19:20], v[39:40], 24 +; SI-NEXT: v_lshr_b64 v[34:35], v[39:40], 16 +; SI-NEXT: v_lshr_b64 v[17:18], v[39:40], 8 +; SI-NEXT: v_lshr_b64 v[27:28], v[42:43], 24 +; SI-NEXT: v_lshr_b64 v[37:38], v[42:43], 16 +; SI-NEXT: v_lshr_b64 v[25:26], v[42:43], 8 +; SI-NEXT: v_lshrrev_b32_e32 v10, 8, v51 +; SI-NEXT: v_lshrrev_b32_e32 v20, 8, v54 +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v40 +; SI-NEXT: v_lshrrev_b32_e32 v35, 8, v43 +; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v0 ; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v15 ; SI-NEXT: v_lshrrev_b32_e32 v23, 24, v23 ; SI-NEXT: 
v_lshrrev_b32_e32 v31, 24, v31 ; SI-NEXT: .LBB109_3: ; %end -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v2, v33 -; SI-NEXT: v_mov_b32_e32 v4, v35 +; SI-NEXT: v_mov_b32_e32 v13, v20 +; SI-NEXT: v_mov_b32_e32 v20, v40 +; SI-NEXT: v_mov_b32_e32 v24, v42 +; SI-NEXT: v_mov_b32_e32 v28, v43 +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v0, v50 +; SI-NEXT: v_mov_b32_e32 v2, v48 +; SI-NEXT: v_mov_b32_e32 v4, v51 ; SI-NEXT: v_mov_b32_e32 v5, v10 +; SI-NEXT: v_mov_b32_e32 v8, v53 ; SI-NEXT: v_mov_b32_e32 v10, v36 -; SI-NEXT: v_mov_b32_e32 v12, v38 -; SI-NEXT: v_mov_b32_e32 v13, v34 -; SI-NEXT: v_mov_b32_e32 v18, v48 -; SI-NEXT: v_mov_b32_e32 v20, v51 -; SI-NEXT: v_mov_b32_e32 v21, v32 -; SI-NEXT: v_mov_b32_e32 v26, v49 -; SI-NEXT: v_mov_b32_e32 v28, v53 -; SI-NEXT: v_mov_b32_e32 v29, v37 +; SI-NEXT: v_mov_b32_e32 v12, v54 +; SI-NEXT: v_mov_b32_e32 v16, v39 +; SI-NEXT: v_mov_b32_e32 v18, v34 +; SI-NEXT: v_mov_b32_e32 v21, v33 +; SI-NEXT: v_mov_b32_e32 v26, v37 +; SI-NEXT: v_mov_b32_e32 v29, v35 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB109_4: -; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; 
implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: s_branch .LBB109_2 ; ; VI-LABEL: bitcast_v16bf16_to_v32i8_scalar: diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll index f482843af010a..88d521a0eaa8b 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll @@ -62159,197 +62159,204 @@ define inreg <32 x i16> @bitcast_v32bf16_to_v32i16_scalar(<32 x bfloat> inreg %a ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_mul_f32_e64 v57, 1.0, s16 -; SI-NEXT: v_mul_f32_e64 v56, 1.0, s17 -; SI-NEXT: v_mul_f32_e32 v39, 1.0, v0 -; SI-NEXT: v_mul_f32_e32 v38, 1.0, v1 -; SI-NEXT: v_mul_f32_e32 v47, 1.0, v2 -; SI-NEXT: v_mul_f32_e32 v46, 1.0, v3 -; SI-NEXT: v_mul_f32_e32 v18, 1.0, v4 -; SI-NEXT: v_mul_f32_e32 v49, 1.0, v5 -; SI-NEXT: v_mul_f32_e32 v45, 1.0, v6 -; SI-NEXT: v_mul_f32_e32 v44, 1.0, v7 -; SI-NEXT: v_mul_f32_e32 v22, 1.0, v8 -; SI-NEXT: v_mul_f32_e32 v51, 1.0, v9 -; SI-NEXT: v_mul_f32_e32 v43, 1.0, v10 -; SI-NEXT: v_mul_f32_e32 v42, 1.0, v11 -; SI-NEXT: v_mul_f32_e32 v26, 1.0, v12 -; SI-NEXT: v_mul_f32_e32 v53, 1.0, v13 -; SI-NEXT: v_mul_f32_e32 v41, 1.0, v14 -; SI-NEXT: v_mul_f32_e32 v55, 1.0, v15 -; SI-NEXT: v_mul_f32_e32 v30, 1.0, v16 -; SI-NEXT: v_mul_f32_e32 v14, 1.0, v17 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v34, 1.0, s19 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v63, 1.0, s20 -; SI-NEXT: v_mul_f32_e64 v62, 1.0, s21 -; SI-NEXT: v_mul_f32_e64 v6, 1.0, s22 -; SI-NEXT: v_mul_f32_e64 v36, 1.0, s23 -; SI-NEXT: v_mul_f32_e64 v61, 1.0, s24 -; SI-NEXT: v_mul_f32_e64 v60, 1.0, s25 -; SI-NEXT: v_mul_f32_e64 v10, 1.0, s26 -; SI-NEXT: v_mul_f32_e64 v40, 1.0, s27 -; SI-NEXT: v_mul_f32_e64 v59, 1.0, s28 -; SI-NEXT: v_mul_f32_e64 v58, 1.0, s29 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mul_f32_e64 v62, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v61, 1.0, s17 +; SI-NEXT: v_mul_f32_e32 v37, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v48, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v60, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v59, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v47, 1.0, 
v4 +; SI-NEXT: v_mul_f32_e32 v19, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v58, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v21, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v35, 1.0, v8 +; SI-NEXT: v_mul_f32_e32 v23, 1.0, v9 +; SI-NEXT: v_mul_f32_e32 v57, 1.0, v10 +; SI-NEXT: v_mul_f32_e32 v25, 1.0, v11 +; SI-NEXT: v_mul_f32_e32 v33, 1.0, v12 +; SI-NEXT: v_mul_f32_e32 v27, 1.0, v13 +; SI-NEXT: v_mul_f32_e32 v56, 1.0, v14 +; SI-NEXT: v_mul_f32_e32 v29, 1.0, v15 +; SI-NEXT: v_mul_f32_e32 v31, 1.0, v16 +; SI-NEXT: v_mul_f32_e32 v15, 1.0, v17 +; SI-NEXT: v_mul_f32_e64 v52, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v0, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v3, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v5, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v50, 1.0, s22 +; SI-NEXT: v_mul_f32_e64 v7, 1.0, s23 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v63, 1.0, s24 +; SI-NEXT: v_mul_f32_e64 v9, 1.0, s25 +; SI-NEXT: v_mul_f32_e64 v55, 1.0, s26 +; SI-NEXT: v_mul_f32_e64 v11, 1.0, s27 +; SI-NEXT: v_mul_f32_e64 v17, 1.0, s28 +; SI-NEXT: v_mul_f32_e64 v13, 1.0, s29 ; SI-NEXT: s_cbranch_scc0 .LBB95_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v57 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v56 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v34 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v63 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v62 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v36 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v61 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v60 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v40 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v59 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v58 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v39 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v38 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v47 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v46 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v49 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v45 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v44 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v51 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v43 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v42 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v53 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v41 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v55 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v30 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v62 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v61 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v63 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v48 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v60 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v59 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v58 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v57 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 
v28, 16, v56 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v15 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v52 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v50 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v55 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v37 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v47 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v35 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v31 ; SI-NEXT: s_cbranch_execnz .LBB95_3 ; SI-NEXT: .LBB95_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v56 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v57 -; SI-NEXT: v_add_f32_e32 v56, 0x40c00000, v1 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v56 -; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v63 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v1 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 +; SI-NEXT: v_add_f32_e32 v53, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v53 +; SI-NEXT: v_lshr_b64 v[4:5], v[2:3], 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v9 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v63 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v5 +; SI-NEXT: v_lshr_b64 v[8:9], v[2:3], 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v13 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v17 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v6 +; SI-NEXT: v_lshr_b64 v[12:13], v[2:3], 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v59 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v60 +; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v9 +; SI-NEXT: v_lshr_b64 v[16:17], v[2:3], 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v21 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v58 +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v10 +; SI-NEXT: v_lshr_b64 v[20:21], v[2:3], 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v25 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v57 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v13 +; SI-NEXT: v_lshr_b64 v[24:25], v[2:3], 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v29 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v56 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v14 +; SI-NEXT: v_lshr_b64 v[28:29], v[2:3], 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v31 +; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v15 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v62 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v33 +; SI-NEXT: v_add_f32_e32 v38, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v61 +; SI-NEXT: v_add_f32_e32 v33, 0x40c00000, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v27 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v1 -; SI-NEXT: v_lshr_b64 v[4:5], v[3:4], 16 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v61 -; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v3 -; SI-NEXT: v_and_b32_e32 v3, 
0xffff0000, v60 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v59 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v3 -; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v5 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v58 -; SI-NEXT: v_lshr_b64 v[8:9], v[7:8], 16 -; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v5 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v45 -; SI-NEXT: v_lshr_b64 v[12:13], v[11:12], 16 -; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v9 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v44 -; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v41 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v47 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v9 -; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v13 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v55 -; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v7 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v46 -; SI-NEXT: v_lshr_b64 v[20:21], v[19:20], 16 -; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v43 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v13 -; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v9 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v18 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v7 -; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v11 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v42 -; SI-NEXT: v_lshr_b64 v[28:29], v[27:28], 16 -; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v9 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v49 -; SI-NEXT: v_lshr_b64 v[16:17], v[15:16], 16 -; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v13 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v26 -; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v11 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v13 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v53 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v9 -; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v7 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v39 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v3 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 -; SI-NEXT: v_lshr_b64 v[24:25], v[23:24], 16 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v14 -; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v7 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v38 -; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v3 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v36 -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v30 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v13 -; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v11 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v22 -; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v5 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v10 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v15 -; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v11 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v51 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v7 -; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v5 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v40 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v3 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v34 -; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_lshr_b64 v[32:33], v[30:31], 16 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v11, 
16, v5 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshr_b64 v[54:55], v[26:27], 16 -; SI-NEXT: v_lshr_b64 v[52:53], v[22:23], 16 -; SI-NEXT: v_lshr_b64 v[50:51], v[18:19], 16 -; SI-NEXT: v_lshr_b64 v[48:49], v[14:15], 16 -; SI-NEXT: v_lshr_b64 v[37:38], v[10:11], 16 -; SI-NEXT: v_lshr_b64 v[35:36], v[6:7], 16 -; SI-NEXT: v_lshr_b64 v[33:34], v[2:3], 16 -; SI-NEXT: v_mov_b32_e32 v30, v32 -; SI-NEXT: v_mov_b32_e32 v26, v54 -; SI-NEXT: v_mov_b32_e32 v22, v52 -; SI-NEXT: v_mov_b32_e32 v18, v50 -; SI-NEXT: v_mov_b32_e32 v14, v48 -; SI-NEXT: v_mov_b32_e32 v10, v37 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; SI-NEXT: v_mov_b32_e32 v6, v35 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v56 -; SI-NEXT: v_mov_b32_e32 v2, v33 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_lshr_b64 v[5:6], v[5:6], 16 -; SI-NEXT: v_lshr_b64 v[9:10], v[9:10], 16 -; SI-NEXT: v_lshr_b64 v[13:14], v[13:14], 16 -; SI-NEXT: v_lshr_b64 v[17:18], v[17:18], 16 -; SI-NEXT: v_lshr_b64 v[21:22], v[21:22], 16 -; SI-NEXT: v_lshr_b64 v[25:26], v[25:26], 16 -; SI-NEXT: v_lshr_b64 v[29:30], v[29:30], 16 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v35 +; SI-NEXT: v_lshr_b64 v[38:39], v[38:39], 16 +; SI-NEXT: v_add_f32_e32 v35, 0x40c00000, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v23 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v47 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_add_f32_e32 v38, 0x40c00000, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v19 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v37 +; SI-NEXT: v_add_f32_e32 v37, 0x40c00000, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v48 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshr_b64 v[18:19], v[38:39], 16 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v55 +; SI-NEXT: v_add_f32_e32 v48, 0x40c00000, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v11 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v50 +; SI-NEXT: v_add_f32_e32 v50, 0x40c00000, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v7 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v2 +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v6 +; SI-NEXT: v_lshr_b64 v[6:7], v[50:51], 16 +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v10 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v9 +; SI-NEXT: v_lshr_b64 v[10:11], v[48:49], 16 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v53 +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v14 +; SI-NEXT: v_lshr_b64 v[14:15], v[37:38], 16 +; SI-NEXT: v_lshr_b64 v[40:41], v[5:6], 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; SI-NEXT: v_lshr_b64 v[41:42], v[9:10], 16 +; SI-NEXT: v_lshr_b64 v[22:23], v[35:36], 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v52 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshr_b64 v[42:43], v[13:14], 16 +; SI-NEXT: v_lshr_b64 v[26:27], v[33:34], 16 +; SI-NEXT: v_add_f32_e32 v52, 0x40c00000, v2 +; SI-NEXT: 
v_lshrrev_b32_e32 v53, 16, v0 +; SI-NEXT: v_lshr_b64 v[43:44], v[17:18], 16 +; SI-NEXT: v_lshr_b64 v[30:31], v[31:32], 16 +; SI-NEXT: v_lshr_b64 v[2:3], v[52:53], 16 +; SI-NEXT: v_lshr_b64 v[44:45], v[21:22], 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_lshr_b64 v[45:46], v[25:26], 16 +; SI-NEXT: v_lshr_b64 v[54:55], v[1:2], 16 +; SI-NEXT: v_lshr_b64 v[46:47], v[29:30], 16 ; SI-NEXT: .LBB95_3: ; %end +; SI-NEXT: v_mov_b32_e32 v5, v40 +; SI-NEXT: v_mov_b32_e32 v9, v41 +; SI-NEXT: v_mov_b32_e32 v13, v42 +; SI-NEXT: v_mov_b32_e32 v17, v43 +; SI-NEXT: v_mov_b32_e32 v21, v44 +; SI-NEXT: v_mov_b32_e32 v25, v45 +; SI-NEXT: v_mov_b32_e32 v29, v46 ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -62366,49 +62373,56 @@ define inreg <32 x i16> @bitcast_v32bf16_to_v32i16_scalar(<32 x bfloat> inreg %a ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v2, v33 -; SI-NEXT: v_mov_b32_e32 v6, v35 -; SI-NEXT: v_mov_b32_e32 v10, v37 -; SI-NEXT: v_mov_b32_e32 v14, v48 -; SI-NEXT: v_mov_b32_e32 v18, v50 -; SI-NEXT: v_mov_b32_e32 v22, v52 -; SI-NEXT: v_mov_b32_e32 v26, v54 -; SI-NEXT: v_mov_b32_e32 v30, v32 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v1, v54 +; SI-NEXT: v_mov_b32_e32 v3, v53 +; SI-NEXT: v_mov_b32_e32 v7, v51 +; SI-NEXT: v_mov_b32_e32 v11, v49 +; SI-NEXT: v_mov_b32_e32 v15, v38 +; SI-NEXT: v_mov_b32_e32 v19, v39 +; SI-NEXT: v_mov_b32_e32 v23, v36 +; SI-NEXT: v_mov_b32_e32 v27, v34 +; SI-NEXT: v_mov_b32_e32 v31, v32 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB95_4: -; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr23 +; 
SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr46 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: s_branch .LBB95_2 ; ; VI-LABEL: bitcast_v32bf16_to_v32i16_scalar: diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.64bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.64bit.ll index 1dcc010349123..3d9c7681b3132 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.64bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.64bit.ll @@ -2272,32 +2272,30 @@ define inreg i64 @bitcast_v4bf16_to_i64_scalar(<4 x bfloat> inreg %a, i32 inreg ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s20, 0 -; SI-NEXT: v_mul_f32_e64 v7, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v4, 1.0, s16 -; SI-NEXT: v_mul_f32_e64 v6, 1.0, s19 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v8, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v5, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v7, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v3, 1.0, s18 ; SI-NEXT: s_cbranch_scc0 .LBB23_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; SI-NEXT: v_lshr_b64 v[8:9], v[2:3], 16 -; SI-NEXT: v_lshr_b64 v[0:1], v[4:5], 16 -; SI-NEXT: v_mov_b32_e32 v1, v8 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v8 +; SI-NEXT: v_lshr_b64 v[0:1], v[5:6], 16 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v7 +; SI-NEXT: v_lshr_b64 v[1:2], v[3:4], 16 ; SI-NEXT: s_cbranch_execnz .LBB23_3 ; SI-NEXT: .LBB23_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v7 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v4 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v8 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v5 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 16 ; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 -; SI-NEXT: v_mov_b32_e32 v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v7 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: .LBB23_3: ; %end ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB23_4: @@ -5463,32 +5461,30 @@ define inreg double @bitcast_v4bf16_to_f64_scalar(<4 x bfloat> inreg %a, i32 inr ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s20, 0 -; SI-NEXT: v_mul_f32_e64 v7, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v4, 1.0, s16 -; SI-NEXT: v_mul_f32_e64 v6, 1.0, s19 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v8, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v5, 1.0, s16 +; SI-NEXT: 
v_mul_f32_e64 v7, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v3, 1.0, s18 ; SI-NEXT: s_cbranch_scc0 .LBB47_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; SI-NEXT: v_lshr_b64 v[8:9], v[2:3], 16 -; SI-NEXT: v_lshr_b64 v[0:1], v[4:5], 16 -; SI-NEXT: v_mov_b32_e32 v1, v8 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v8 +; SI-NEXT: v_lshr_b64 v[0:1], v[5:6], 16 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v7 +; SI-NEXT: v_lshr_b64 v[1:2], v[3:4], 16 ; SI-NEXT: s_cbranch_execnz .LBB47_3 ; SI-NEXT: .LBB47_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v7 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v4 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v8 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v5 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 16 ; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 -; SI-NEXT: v_mov_b32_e32 v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v7 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: .LBB47_3: ; %end ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB47_4: @@ -8367,32 +8363,30 @@ define inreg <2 x i32> @bitcast_v4bf16_to_v2i32_scalar(<4 x bfloat> inreg %a, i3 ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s20, 0 -; SI-NEXT: v_mul_f32_e64 v7, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v4, 1.0, s16 -; SI-NEXT: v_mul_f32_e64 v6, 1.0, s19 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v8, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v5, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v7, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v3, 1.0, s18 ; SI-NEXT: s_cbranch_scc0 .LBB67_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; SI-NEXT: v_lshr_b64 v[8:9], v[2:3], 16 -; SI-NEXT: v_lshr_b64 v[0:1], v[4:5], 16 -; SI-NEXT: v_mov_b32_e32 v1, v8 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v8 +; SI-NEXT: v_lshr_b64 v[0:1], v[5:6], 16 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v7 +; SI-NEXT: v_lshr_b64 v[1:2], v[3:4], 16 ; SI-NEXT: s_cbranch_execnz .LBB67_3 ; SI-NEXT: .LBB67_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v7 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v4 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v8 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v5 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 16 ; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 -; SI-NEXT: v_mov_b32_e32 v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v7 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: .LBB67_3: ; %end ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: 
.LBB67_4: @@ -10946,32 +10940,30 @@ define inreg <2 x float> @bitcast_v4bf16_to_v2f32_scalar(<4 x bfloat> inreg %a, ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s20, 0 -; SI-NEXT: v_mul_f32_e64 v7, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v4, 1.0, s16 -; SI-NEXT: v_mul_f32_e64 v6, 1.0, s19 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v8, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v5, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v7, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v3, 1.0, s18 ; SI-NEXT: s_cbranch_scc0 .LBB83_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; SI-NEXT: v_lshr_b64 v[8:9], v[2:3], 16 -; SI-NEXT: v_lshr_b64 v[0:1], v[4:5], 16 -; SI-NEXT: v_mov_b32_e32 v1, v8 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v8 +; SI-NEXT: v_lshr_b64 v[0:1], v[5:6], 16 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v7 +; SI-NEXT: v_lshr_b64 v[1:2], v[3:4], 16 ; SI-NEXT: s_cbranch_execnz .LBB83_3 ; SI-NEXT: .LBB83_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v7 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v4 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v8 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v5 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 16 ; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 -; SI-NEXT: v_mov_b32_e32 v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v7 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: .LBB83_3: ; %end ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB83_4: @@ -13162,40 +13154,40 @@ define inreg <4 x i16> @bitcast_v4bf16_to_v4i16_scalar(<4 x bfloat> inreg %a, i3 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s20, 0 ; SI-NEXT: v_mul_f32_e64 v7, 1.0, s16 -; SI-NEXT: v_mul_f32_e64 v6, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v5, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v3, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v6, 1.0, s19 ; SI-NEXT: s_cbranch_scc0 .LBB95_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v6 ; SI-NEXT: s_cbranch_execnz .LBB95_3 ; SI-NEXT: .LBB95_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v7 -; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v1 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v5 ; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v5 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; SI-NEXT: v_lshr_b64 v[4:5], v[2:3], 
16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 -; SI-NEXT: v_mov_b32_e32 v2, v4 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; SI-NEXT: v_lshr_b64 v[2:3], v[3:4], 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v5 +; SI-NEXT: v_lshr_b64 v[5:6], v[1:2], 16 ; SI-NEXT: .LBB95_3: ; %end -; SI-NEXT: v_mov_b32_e32 v2, v4 +; SI-NEXT: v_mov_b32_e32 v1, v5 +; SI-NEXT: v_mov_b32_e32 v3, v4 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB95_4: ; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: s_branch .LBB95_2 ; ; VI-LABEL: bitcast_v4bf16_to_v4i16_scalar: @@ -16753,53 +16745,51 @@ define inreg <8 x i8> @bitcast_v4bf16_to_v8i8_scalar(<4 x bfloat> inreg %a, i32 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s20, 0 ; SI-NEXT: v_mul_f32_e64 v15, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v12, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v0, 1.0, s16 ; SI-NEXT: v_mul_f32_e64 v14, 1.0, s19 ; SI-NEXT: v_mul_f32_e64 v5, 1.0, s18 ; SI-NEXT: s_cbranch_scc0 .LBB109_4 ; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v15 +; SI-NEXT: v_lshr_b64 v[11:12], v[0:1], 16 ; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v15 -; SI-NEXT: v_lshr_b64 v[10:11], v[5:6], 16 -; SI-NEXT: v_lshr_b64 v[0:1], v[12:13], 16 -; SI-NEXT: v_mov_b32_e32 v1, v10 -; SI-NEXT: v_lshr_b64 v[8:9], v[0:1], 16 -; SI-NEXT: v_lshr_b64 v[3:4], v[0:1], 24 -; SI-NEXT: v_lshr_b64 v[1:2], v[0:1], 8 +; SI-NEXT: v_lshr_b64 v[12:13], v[5:6], 16 ; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v14 -; SI-NEXT: v_lshrrev_b32_e32 v9, 8, v10 +; SI-NEXT: v_lshr_b64 v[3:4], v[11:12], 24 +; SI-NEXT: v_lshr_b64 v[9:10], v[11:12], 16 +; SI-NEXT: v_lshr_b64 v[1:2], v[11:12], 8 +; SI-NEXT: v_lshrrev_b32_e32 v8, 8, v12 ; SI-NEXT: s_cbranch_execnz .LBB109_3 ; SI-NEXT: .LBB109_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 -; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v14 ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v15 -; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v2 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v12 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v7 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshr_b64 v[10:11], v[5:6], 16 -; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 -; SI-NEXT: v_mov_b32_e32 v1, v10 -; SI-NEXT: v_lshr_b64 v[3:4], v[0:1], 24 -; SI-NEXT: v_lshr_b64 v[8:9], v[0:1], 16 -; SI-NEXT: v_lshr_b64 v[1:2], v[0:1], 8 -; SI-NEXT: v_lshrrev_b32_e32 v9, 8, v10 -; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v7 +; SI-NEXT: v_lshr_b64 v[11:12], v[0:1], 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v14 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; SI-NEXT: v_lshr_b64 v[12:13], v[5:6], 16 +; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v0 +; SI-NEXT: v_lshr_b64 v[3:4], v[11:12], 24 +; SI-NEXT: v_lshr_b64 v[9:10], v[11:12], 16 +; SI-NEXT: v_lshr_b64 v[1:2], v[11:12], 8 +; SI-NEXT: v_lshrrev_b32_e32 v8, 8, v12 ; SI-NEXT: .LBB109_3: ; %end -; SI-NEXT: v_mov_b32_e32 v2, v8 -; SI-NEXT: v_mov_b32_e32 
v4, v10 -; SI-NEXT: v_mov_b32_e32 v5, v9 +; SI-NEXT: v_mov_b32_e32 v0, v11 +; SI-NEXT: v_mov_b32_e32 v2, v9 +; SI-NEXT: v_mov_b32_e32 v4, v12 +; SI-NEXT: v_mov_b32_e32 v5, v8 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB109_4: -; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: s_branch .LBB109_2 ; diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll index 4abb70e0ec5c9..ab629e1a4d269 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll @@ -2214,42 +2214,40 @@ define inreg <3 x i32> @bitcast_v6bf16_to_v3i32_scalar(<6 x bfloat> inreg %a, i3 ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s22, 0 -; SI-NEXT: v_mul_f32_e64 v11, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v7, 1.0, s16 -; SI-NEXT: v_mul_f32_e64 v10, 1.0, s19 -; SI-NEXT: v_mul_f32_e64 v5, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v9, 1.0, s21 -; SI-NEXT: v_mul_f32_e64 v3, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v12, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v8, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v11, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v6, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v10, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v4, 1.0, s20 ; SI-NEXT: s_cbranch_scc0 .LBB11_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v9 -; SI-NEXT: v_lshr_b64 v[0:1], v[7:8], 16 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v10 -; SI-NEXT: v_lshr_b64 v[12:13], v[3:4], 16 -; SI-NEXT: v_lshr_b64 v[1:2], v[5:6], 16 -; SI-NEXT: v_mov_b32_e32 v2, v12 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v12 +; SI-NEXT: v_lshr_b64 v[0:1], v[8:9], 16 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v11 +; SI-NEXT: v_lshr_b64 v[1:2], v[6:7], 16 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v10 +; SI-NEXT: v_lshr_b64 v[2:3], v[4:5], 16 ; SI-NEXT: s_cbranch_execnz .LBB11_3 ; SI-NEXT: .LBB11_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v11 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v7 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v12 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v8 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v9 ; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v10 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v5 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v11 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_lshr_b64 v[3:4], v[3:4], 16 ; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_mov_b32_e32 v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v10 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v4 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 16 ; SI-NEXT: .LBB11_3: ; %end ; SI-NEXT: s_setpc_b64 s[30:31] ; 
SI-NEXT: .LBB11_4: @@ -5433,42 +5431,40 @@ define inreg <3 x float> @bitcast_v6bf16_to_v3f32_scalar(<6 x bfloat> inreg %a, ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s22, 0 -; SI-NEXT: v_mul_f32_e64 v11, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v7, 1.0, s16 -; SI-NEXT: v_mul_f32_e64 v10, 1.0, s19 -; SI-NEXT: v_mul_f32_e64 v5, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v9, 1.0, s21 -; SI-NEXT: v_mul_f32_e64 v3, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v12, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v8, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v11, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v6, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v10, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v4, 1.0, s20 ; SI-NEXT: s_cbranch_scc0 .LBB27_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v9 -; SI-NEXT: v_lshr_b64 v[0:1], v[7:8], 16 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v10 -; SI-NEXT: v_lshr_b64 v[12:13], v[3:4], 16 -; SI-NEXT: v_lshr_b64 v[1:2], v[5:6], 16 -; SI-NEXT: v_mov_b32_e32 v2, v12 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v12 +; SI-NEXT: v_lshr_b64 v[0:1], v[8:9], 16 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v11 +; SI-NEXT: v_lshr_b64 v[1:2], v[6:7], 16 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v10 +; SI-NEXT: v_lshr_b64 v[2:3], v[4:5], 16 ; SI-NEXT: s_cbranch_execnz .LBB27_3 ; SI-NEXT: .LBB27_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v11 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v7 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v12 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v8 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v9 ; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v10 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v5 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v11 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_lshr_b64 v[3:4], v[3:4], 16 ; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_mov_b32_e32 v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v10 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v4 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 16 ; SI-NEXT: .LBB27_3: ; %end ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB27_4: @@ -8105,72 +8101,70 @@ define inreg <12 x i8> @bitcast_v6bf16_to_v12i8_scalar(<6 x bfloat> inreg %a, i3 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s22, 0 ; SI-NEXT: v_mul_f32_e64 v21, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v17, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v0, 1.0, s16 ; SI-NEXT: v_mul_f32_e64 v20, 1.0, s19 ; SI-NEXT: v_mul_f32_e64 v5, 1.0, s18 ; SI-NEXT: v_mul_f32_e64 v19, 1.0, s21 ; SI-NEXT: v_mul_f32_e64 v9, 1.0, s20 ; SI-NEXT: s_cbranch_scc0 .LBB39_4 ; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v21 +; SI-NEXT: v_lshr_b64 v[16:17], v[0:1], 16 ; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v21 -; SI-NEXT: v_lshr_b64 v[14:15], v[5:6], 16 ; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v19 -; SI-NEXT: v_lshr_b64 v[0:1], v[17:18], 16 -; SI-NEXT: 
v_lshr_b64 v[15:16], v[9:10], 16 -; SI-NEXT: v_mov_b32_e32 v1, v14 -; SI-NEXT: v_lshr_b64 v[3:4], v[0:1], 24 +; SI-NEXT: v_lshr_b64 v[17:18], v[5:6], 16 +; SI-NEXT: v_lshr_b64 v[13:14], v[9:10], 16 +; SI-NEXT: v_lshr_b64 v[3:4], v[16:17], 24 ; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v20 ; SI-NEXT: v_lshrrev_b32_e32 v11, 24, v19 -; SI-NEXT: v_lshrrev_b32_e32 v8, 8, v14 -; SI-NEXT: v_lshrrev_b32_e32 v16, 8, v15 -; SI-NEXT: v_lshr_b64 v[12:13], v[0:1], 16 -; SI-NEXT: v_lshr_b64 v[1:2], v[0:1], 8 +; SI-NEXT: v_lshrrev_b32_e32 v8, 8, v17 +; SI-NEXT: v_lshrrev_b32_e32 v12, 8, v13 +; SI-NEXT: v_lshr_b64 v[14:15], v[16:17], 16 +; SI-NEXT: v_lshr_b64 v[1:2], v[16:17], 8 ; SI-NEXT: s_cbranch_execnz .LBB39_3 ; SI-NEXT: .LBB39_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 -; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v20 ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v21 -; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v9 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v17 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v7 -; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v19 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshr_b64 v[14:15], v[5:6], 16 -; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v2 -; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 +; SI-NEXT: v_lshr_b64 v[16:17], v[0:1], 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v20 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v9 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v19 +; SI-NEXT: v_lshr_b64 v[17:18], v[5:6], 16 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v1 ; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v5 -; SI-NEXT: v_mov_b32_e32 v1, v14 -; SI-NEXT: v_lshr_b64 v[15:16], v[9:10], 16 -; SI-NEXT: v_lshr_b64 v[3:4], v[0:1], 24 -; SI-NEXT: v_lshr_b64 v[12:13], v[0:1], 16 -; SI-NEXT: v_lshr_b64 v[1:2], v[0:1], 8 -; SI-NEXT: v_lshrrev_b32_e32 v8, 8, v14 -; SI-NEXT: v_lshrrev_b32_e32 v16, 8, v15 -; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v7 +; SI-NEXT: v_lshr_b64 v[13:14], v[9:10], 16 +; SI-NEXT: v_lshr_b64 v[3:4], v[16:17], 24 +; SI-NEXT: v_lshr_b64 v[14:15], v[16:17], 16 +; SI-NEXT: v_lshr_b64 v[1:2], v[16:17], 8 +; SI-NEXT: v_lshrrev_b32_e32 v8, 8, v17 +; SI-NEXT: v_lshrrev_b32_e32 v12, 8, v13 +; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v0 ; SI-NEXT: v_lshrrev_b32_e32 v11, 24, v5 ; SI-NEXT: .LBB39_3: ; %end -; SI-NEXT: v_mov_b32_e32 v2, v12 -; SI-NEXT: v_mov_b32_e32 v4, v14 +; SI-NEXT: v_mov_b32_e32 v0, v16 +; SI-NEXT: v_mov_b32_e32 v2, v14 +; SI-NEXT: v_mov_b32_e32 v4, v17 ; SI-NEXT: v_mov_b32_e32 v5, v8 -; SI-NEXT: v_mov_b32_e32 v8, v15 -; SI-NEXT: v_mov_b32_e32 v9, v16 +; SI-NEXT: v_mov_b32_e32 v8, v13 +; SI-NEXT: v_mov_b32_e32 v9, v12 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB39_4: -; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; 
implicit-def: $vgpr12
 ; SI-NEXT: ; implicit-def: $vgpr11
 ; SI-NEXT: s_branch .LBB39_2
 ;
@@ -12823,51 +12817,51 @@ define inreg <6 x i16> @bitcast_v6bf16_to_v6i16_scalar(<6 x bfloat> inreg %a, i3
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT: s_cmp_lg_u32 s22, 0
 ; SI-NEXT: v_mul_f32_e64 v11, 1.0, s16
-; SI-NEXT: v_mul_f32_e64 v10, 1.0, s17
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s18
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s17
+; SI-NEXT: v_mul_f32_e64 v3, 1.0, s18
 ; SI-NEXT: v_mul_f32_e64 v8, 1.0, s19
-; SI-NEXT: v_mul_f32_e64 v4, 1.0, s20
+; SI-NEXT: v_mul_f32_e64 v10, 1.0, s20
 ; SI-NEXT: v_mul_f32_e64 v9, 1.0, s21
 ; SI-NEXT: s_cbranch_scc0 .LBB53_4
 ; SI-NEXT: ; %bb.1: ; %cmp.false
 ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v11
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v10
-; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v2
-; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v8
-; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v4
+; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v1
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v3
+; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v8
+; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v10
 ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v9
 ; SI-NEXT: s_cbranch_execnz .LBB53_3
 ; SI-NEXT: .LBB53_2: ; %cmp.true
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v10
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v11
-; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v1
+; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v1
 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v10
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v11
 ; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v10
 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v1
 ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v9
 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
 ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v1
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
-; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v1
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v3
+; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v1
 ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v8
 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
 ; SI-NEXT: v_lshr_b64 v[6:7], v[4:5], 16
-; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1
-; SI-NEXT: v_lshr_b64 v[7:8], v[2:3], 16
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v10
-; SI-NEXT: v_mov_b32_e32 v2, v7
-; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
+; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v1
+; SI-NEXT: v_lshr_b64 v[2:3], v[3:4], 16
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v11
+; SI-NEXT: v_lshr_b64 v[7:8], v[1:2], 16
 ; SI-NEXT: .LBB53_3: ; %end
-; SI-NEXT: v_mov_b32_e32 v2, v7
+; SI-NEXT: v_mov_b32_e32 v1, v7
+; SI-NEXT: v_mov_b32_e32 v3, v4
 ; SI-NEXT: v_mov_b32_e32 v4, v6
 ; SI-NEXT: s_setpc_b64 s[30:31]
 ; SI-NEXT: .LBB53_4:
 ; SI-NEXT: ; implicit-def: $vgpr0
-; SI-NEXT: ; implicit-def: $vgpr1
 ; SI-NEXT: ; implicit-def: $vgpr7
-; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr4
 ; SI-NEXT: ; implicit-def: $vgpr6
 ; SI-NEXT: ; implicit-def: $vgpr5
 ; SI-NEXT: s_branch .LBB53_2

From dd1564e0e7bde9eda72320b6b839a16c3bcee642 Mon Sep 17 00:00:00 2001
From: Prabhu Rajasekaran
Date: Tue, 11 Nov 2025 14:26:40 -0800
Subject: [PATCH 57/64] [MachO] Report error when there are too many sections (#167418)

When there are more than 255 sections, the MachO object writer allows
the creation of potentially malformed object files. Currently, there
are assertions in the object writer code that prevent this behavior.
But for distributions where assertions are turned off, this still
results in the creation of malformed object files. This patch turns
those assertions into explicit errors.
---
 llvm/lib/MC/MachObjectWriter.cpp           |   14 +-
 llvm/test/MC/MachO/invalid-section-index.s | 1571 ++++++++++++++++++++
 2 files changed, 1582 insertions(+), 3 deletions(-)
 create mode 100644 llvm/test/MC/MachO/invalid-section-index.s

diff --git a/llvm/lib/MC/MachObjectWriter.cpp b/llvm/lib/MC/MachObjectWriter.cpp
index a95ccf83a2636..a8535dfa8a5d3 100644
--- a/llvm/lib/MC/MachObjectWriter.cpp
+++ b/llvm/lib/MC/MachObjectWriter.cpp
@@ -28,6 +28,7 @@
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/MathExtras.h"
+#include "llvm/Support/SMLoc.h"
 #include "llvm/Support/raw_ostream.h"
 #include
 #include
@@ -584,7 +585,12 @@ void MachObjectWriter::computeSymbolTable(
   unsigned Index = 1;
   for (MCSection &Sec : Asm)
     SectionIndexMap[&Sec] = Index++;
-  assert(Index <= 256 && "Too many sections!");
+
+  // Section indices begin at 1 in MachO. Only sections 1-255 can be
+  // referenced by section symbols; referencing a section with an index
+  // larger than 255 leaves n_sect unset for those symbols.
+  if (Index > 255)
+    getContext().reportError(SMLoc(), "Too many sections!");
 
   // Build the string table.
   for (const MCSymbol &Symbol : Asm.symbols()) {
@@ -621,7 +627,8 @@
       ExternalSymbolData.push_back(MSD);
     } else {
       MSD.SectionIndex = SectionIndexMap.lookup(&Symbol.getSection());
-      assert(MSD.SectionIndex && "Invalid section index!");
+      if (!MSD.SectionIndex)
+        getContext().reportError(SMLoc(), "Invalid section index!");
       ExternalSymbolData.push_back(MSD);
     }
   }
@@ -645,7 +652,8 @@
       LocalSymbolData.push_back(MSD);
     } else {
       MSD.SectionIndex = SectionIndexMap.lookup(&Symbol.getSection());
-      assert(MSD.SectionIndex && "Invalid section index!");
+      if (!MSD.SectionIndex)
+        getContext().reportError(SMLoc(), "Invalid section index!");
       LocalSymbolData.push_back(MSD);
     }
   }
diff --git a/llvm/test/MC/MachO/invalid-section-index.s b/llvm/test/MC/MachO/invalid-section-index.s
new file mode 100644
index 0000000000000..55a0ce5b40ea7
--- /dev/null
+++ b/llvm/test/MC/MachO/invalid-section-index.s
@@ -0,0 +1,1571 @@
+/// Test that when there are more than 255 sections, an error about too many sections is reported.
+
+// RUN: not llvm-mc -filetype=obj -triple arm64-apple-macos %s -o - 2>&1 | FileCheck %s --check-prefix=MACHOERROR
+
+// MACHOERROR: error: Too many sections!
+// MACHOERROR-NEXT: error: Invalid section index!
+// MACHOERROR-NEXT: error: Invalid section index!
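+
+/// The remainder of this file builds up the failure case: after _main in
+/// __TEXT,__text it defines one global per section in seg,sect0, seg,sect1,
+/// and so on, until the section count passes 255, the largest index that
+/// fits in the 8-bit n_sect field of a Mach-O symbol table entry.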
+ + .section __TEXT,__text,regular,pure_instructions + .globl _main ; -- Begin function main + .p2align 2 +_main: ; @main + .cfi_startproc +; %bb.0: ; %entry + sub sp, sp, #16 + .cfi_def_cfa_offset 16 + mov w0, #0 ; =0x0 + str wzr, [sp, #12] + add sp, sp, #16 + ret + .cfi_endproc + ; -- End function + .section seg,sect0 + .globl _var0 ; @var0 + .p2align 2, 0x0 +_var0: + .long 0 ; 0x0 + + .section seg,sect1 + .globl _var1 ; @var1 + .p2align 2, 0x0 +_var1: + .long 1 ; 0x1 + + .section seg,sect2 + .globl _var2 ; @var2 + .p2align 2, 0x0 +_var2: + .long 2 ; 0x2 + + .section seg,sect3 + .globl _var3 ; @var3 + .p2align 2, 0x0 +_var3: + .long 3 ; 0x3 + + .section seg,sect4 + .globl _var4 ; @var4 + .p2align 2, 0x0 +_var4: + .long 4 ; 0x4 + + .section seg,sect5 + .globl _var5 ; @var5 + .p2align 2, 0x0 +_var5: + .long 5 ; 0x5 + + .section seg,sect6 + .globl _var6 ; @var6 + .p2align 2, 0x0 +_var6: + .long 6 ; 0x6 + + .section seg,sect7 + .globl _var7 ; @var7 + .p2align 2, 0x0 +_var7: + .long 7 ; 0x7 + + .section seg,sect8 + .globl _var8 ; @var8 + .p2align 2, 0x0 +_var8: + .long 8 ; 0x8 + + .section seg,sect9 + .globl _var9 ; @var9 + .p2align 2, 0x0 +_var9: + .long 9 ; 0x9 + + .section seg,sect10 + .globl _var10 ; @var10 + .p2align 2, 0x0 +_var10: + .long 10 ; 0xa + + .section seg,sect11 + .globl _var11 ; @var11 + .p2align 2, 0x0 +_var11: + .long 11 ; 0xb + + .section seg,sect12 + .globl _var12 ; @var12 + .p2align 2, 0x0 +_var12: + .long 12 ; 0xc + + .section seg,sect13 + .globl _var13 ; @var13 + .p2align 2, 0x0 +_var13: + .long 13 ; 0xd + + .section seg,sect14 + .globl _var14 ; @var14 + .p2align 2, 0x0 +_var14: + .long 14 ; 0xe + + .section seg,sect15 + .globl _var15 ; @var15 + .p2align 2, 0x0 +_var15: + .long 15 ; 0xf + + .section seg,sect16 + .globl _var16 ; @var16 + .p2align 2, 0x0 +_var16: + .long 16 ; 0x10 + + .section seg,sect17 + .globl _var17 ; @var17 + .p2align 2, 0x0 +_var17: + .long 17 ; 0x11 + + .section seg,sect18 + .globl _var18 ; @var18 + .p2align 2, 0x0 +_var18: + .long 18 ; 0x12 + + .section seg,sect19 + .globl _var19 ; @var19 + .p2align 2, 0x0 +_var19: + .long 19 ; 0x13 + + .section seg,sect20 + .globl _var20 ; @var20 + .p2align 2, 0x0 +_var20: + .long 20 ; 0x14 + + .section seg,sect21 + .globl _var21 ; @var21 + .p2align 2, 0x0 +_var21: + .long 21 ; 0x15 + + .section seg,sect22 + .globl _var22 ; @var22 + .p2align 2, 0x0 +_var22: + .long 22 ; 0x16 + + .section seg,sect23 + .globl _var23 ; @var23 + .p2align 2, 0x0 +_var23: + .long 23 ; 0x17 + + .section seg,sect24 + .globl _var24 ; @var24 + .p2align 2, 0x0 +_var24: + .long 24 ; 0x18 + + .section seg,sect25 + .globl _var25 ; @var25 + .p2align 2, 0x0 +_var25: + .long 25 ; 0x19 + + .section seg,sect26 + .globl _var26 ; @var26 + .p2align 2, 0x0 +_var26: + .long 26 ; 0x1a + + .section seg,sect27 + .globl _var27 ; @var27 + .p2align 2, 0x0 +_var27: + .long 27 ; 0x1b + + .section seg,sect28 + .globl _var28 ; @var28 + .p2align 2, 0x0 +_var28: + .long 28 ; 0x1c + + .section seg,sect29 + .globl _var29 ; @var29 + .p2align 2, 0x0 +_var29: + .long 29 ; 0x1d + + .section seg,sect30 + .globl _var30 ; @var30 + .p2align 2, 0x0 +_var30: + .long 30 ; 0x1e + + .section seg,sect31 + .globl _var31 ; @var31 + .p2align 2, 0x0 +_var31: + .long 31 ; 0x1f + + .section seg,sect32 + .globl _var32 ; @var32 + .p2align 2, 0x0 +_var32: + .long 32 ; 0x20 + + .section seg,sect33 + .globl _var33 ; @var33 + .p2align 2, 0x0 +_var33: + .long 33 ; 0x21 + + .section seg,sect34 + .globl _var34 ; @var34 + .p2align 2, 0x0 +_var34: + .long 34 ; 0x22 + + .section seg,sect35 + 
.globl _var35 ; @var35 + .p2align 2, 0x0 +_var35: + .long 35 ; 0x23 + + .section seg,sect36 + .globl _var36 ; @var36 + .p2align 2, 0x0 +_var36: + .long 36 ; 0x24 + + .section seg,sect37 + .globl _var37 ; @var37 + .p2align 2, 0x0 +_var37: + .long 37 ; 0x25 + + .section seg,sect38 + .globl _var38 ; @var38 + .p2align 2, 0x0 +_var38: + .long 38 ; 0x26 + + .section seg,sect39 + .globl _var39 ; @var39 + .p2align 2, 0x0 +_var39: + .long 39 ; 0x27 + + .section seg,sect40 + .globl _var40 ; @var40 + .p2align 2, 0x0 +_var40: + .long 40 ; 0x28 + + .section seg,sect41 + .globl _var41 ; @var41 + .p2align 2, 0x0 +_var41: + .long 41 ; 0x29 + + .section seg,sect42 + .globl _var42 ; @var42 + .p2align 2, 0x0 +_var42: + .long 42 ; 0x2a + + .section seg,sect43 + .globl _var43 ; @var43 + .p2align 2, 0x0 +_var43: + .long 43 ; 0x2b + + .section seg,sect44 + .globl _var44 ; @var44 + .p2align 2, 0x0 +_var44: + .long 44 ; 0x2c + + .section seg,sect45 + .globl _var45 ; @var45 + .p2align 2, 0x0 +_var45: + .long 45 ; 0x2d + + .section seg,sect46 + .globl _var46 ; @var46 + .p2align 2, 0x0 +_var46: + .long 46 ; 0x2e + + .section seg,sect47 + .globl _var47 ; @var47 + .p2align 2, 0x0 +_var47: + .long 47 ; 0x2f + + .section seg,sect48 + .globl _var48 ; @var48 + .p2align 2, 0x0 +_var48: + .long 48 ; 0x30 + + .section seg,sect49 + .globl _var49 ; @var49 + .p2align 2, 0x0 +_var49: + .long 49 ; 0x31 + + .section seg,sect50 + .globl _var50 ; @var50 + .p2align 2, 0x0 +_var50: + .long 50 ; 0x32 + + .section seg,sect51 + .globl _var51 ; @var51 + .p2align 2, 0x0 +_var51: + .long 51 ; 0x33 + + .section seg,sect52 + .globl _var52 ; @var52 + .p2align 2, 0x0 +_var52: + .long 52 ; 0x34 + + .section seg,sect53 + .globl _var53 ; @var53 + .p2align 2, 0x0 +_var53: + .long 53 ; 0x35 + + .section seg,sect54 + .globl _var54 ; @var54 + .p2align 2, 0x0 +_var54: + .long 54 ; 0x36 + + .section seg,sect55 + .globl _var55 ; @var55 + .p2align 2, 0x0 +_var55: + .long 55 ; 0x37 + + .section seg,sect56 + .globl _var56 ; @var56 + .p2align 2, 0x0 +_var56: + .long 56 ; 0x38 + + .section seg,sect57 + .globl _var57 ; @var57 + .p2align 2, 0x0 +_var57: + .long 57 ; 0x39 + + .section seg,sect58 + .globl _var58 ; @var58 + .p2align 2, 0x0 +_var58: + .long 58 ; 0x3a + + .section seg,sect59 + .globl _var59 ; @var59 + .p2align 2, 0x0 +_var59: + .long 59 ; 0x3b + + .section seg,sect60 + .globl _var60 ; @var60 + .p2align 2, 0x0 +_var60: + .long 60 ; 0x3c + + .section seg,sect61 + .globl _var61 ; @var61 + .p2align 2, 0x0 +_var61: + .long 61 ; 0x3d + + .section seg,sect62 + .globl _var62 ; @var62 + .p2align 2, 0x0 +_var62: + .long 62 ; 0x3e + + .section seg,sect63 + .globl _var63 ; @var63 + .p2align 2, 0x0 +_var63: + .long 63 ; 0x3f + + .section seg,sect64 + .globl _var64 ; @var64 + .p2align 2, 0x0 +_var64: + .long 64 ; 0x40 + + .section seg,sect65 + .globl _var65 ; @var65 + .p2align 2, 0x0 +_var65: + .long 65 ; 0x41 + + .section seg,sect66 + .globl _var66 ; @var66 + .p2align 2, 0x0 +_var66: + .long 66 ; 0x42 + + .section seg,sect67 + .globl _var67 ; @var67 + .p2align 2, 0x0 +_var67: + .long 67 ; 0x43 + + .section seg,sect68 + .globl _var68 ; @var68 + .p2align 2, 0x0 +_var68: + .long 68 ; 0x44 + + .section seg,sect69 + .globl _var69 ; @var69 + .p2align 2, 0x0 +_var69: + .long 69 ; 0x45 + + .section seg,sect70 + .globl _var70 ; @var70 + .p2align 2, 0x0 +_var70: + .long 70 ; 0x46 + + .section seg,sect71 + .globl _var71 ; @var71 + .p2align 2, 0x0 +_var71: + .long 71 ; 0x47 + + .section seg,sect72 + .globl _var72 ; @var72 + .p2align 2, 0x0 +_var72: + .long 72 ; 0x48 + + 
.section seg,sect73 + .globl _var73 ; @var73 + .p2align 2, 0x0 +_var73: + .long 73 ; 0x49 + + .section seg,sect74 + .globl _var74 ; @var74 + .p2align 2, 0x0 +_var74: + .long 74 ; 0x4a + + .section seg,sect75 + .globl _var75 ; @var75 + .p2align 2, 0x0 +_var75: + .long 75 ; 0x4b + + .section seg,sect76 + .globl _var76 ; @var76 + .p2align 2, 0x0 +_var76: + .long 76 ; 0x4c + + .section seg,sect77 + .globl _var77 ; @var77 + .p2align 2, 0x0 +_var77: + .long 77 ; 0x4d + + .section seg,sect78 + .globl _var78 ; @var78 + .p2align 2, 0x0 +_var78: + .long 78 ; 0x4e + + .section seg,sect79 + .globl _var79 ; @var79 + .p2align 2, 0x0 +_var79: + .long 79 ; 0x4f + + .section seg,sect80 + .globl _var80 ; @var80 + .p2align 2, 0x0 +_var80: + .long 80 ; 0x50 + + .section seg,sect81 + .globl _var81 ; @var81 + .p2align 2, 0x0 +_var81: + .long 81 ; 0x51 + + .section seg,sect82 + .globl _var82 ; @var82 + .p2align 2, 0x0 +_var82: + .long 82 ; 0x52 + + .section seg,sect83 + .globl _var83 ; @var83 + .p2align 2, 0x0 +_var83: + .long 83 ; 0x53 + + .section seg,sect84 + .globl _var84 ; @var84 + .p2align 2, 0x0 +_var84: + .long 84 ; 0x54 + + .section seg,sect85 + .globl _var85 ; @var85 + .p2align 2, 0x0 +_var85: + .long 85 ; 0x55 + + .section seg,sect86 + .globl _var86 ; @var86 + .p2align 2, 0x0 +_var86: + .long 86 ; 0x56 + + .section seg,sect87 + .globl _var87 ; @var87 + .p2align 2, 0x0 +_var87: + .long 87 ; 0x57 + + .section seg,sect88 + .globl _var88 ; @var88 + .p2align 2, 0x0 +_var88: + .long 88 ; 0x58 + + .section seg,sect89 + .globl _var89 ; @var89 + .p2align 2, 0x0 +_var89: + .long 89 ; 0x59 + + .section seg,sect90 + .globl _var90 ; @var90 + .p2align 2, 0x0 +_var90: + .long 90 ; 0x5a + + .section seg,sect91 + .globl _var91 ; @var91 + .p2align 2, 0x0 +_var91: + .long 91 ; 0x5b + + .section seg,sect92 + .globl _var92 ; @var92 + .p2align 2, 0x0 +_var92: + .long 92 ; 0x5c + + .section seg,sect93 + .globl _var93 ; @var93 + .p2align 2, 0x0 +_var93: + .long 93 ; 0x5d + + .section seg,sect94 + .globl _var94 ; @var94 + .p2align 2, 0x0 +_var94: + .long 94 ; 0x5e + + .section seg,sect95 + .globl _var95 ; @var95 + .p2align 2, 0x0 +_var95: + .long 95 ; 0x5f + + .section seg,sect96 + .globl _var96 ; @var96 + .p2align 2, 0x0 +_var96: + .long 96 ; 0x60 + + .section seg,sect97 + .globl _var97 ; @var97 + .p2align 2, 0x0 +_var97: + .long 97 ; 0x61 + + .section seg,sect98 + .globl _var98 ; @var98 + .p2align 2, 0x0 +_var98: + .long 98 ; 0x62 + + .section seg,sect99 + .globl _var99 ; @var99 + .p2align 2, 0x0 +_var99: + .long 99 ; 0x63 + + .section seg,sect100 + .globl _var100 ; @var100 + .p2align 2, 0x0 +_var100: + .long 100 ; 0x64 + + .section seg,sect101 + .globl _var101 ; @var101 + .p2align 2, 0x0 +_var101: + .long 101 ; 0x65 + + .section seg,sect102 + .globl _var102 ; @var102 + .p2align 2, 0x0 +_var102: + .long 102 ; 0x66 + + .section seg,sect103 + .globl _var103 ; @var103 + .p2align 2, 0x0 +_var103: + .long 103 ; 0x67 + + .section seg,sect104 + .globl _var104 ; @var104 + .p2align 2, 0x0 +_var104: + .long 104 ; 0x68 + + .section seg,sect105 + .globl _var105 ; @var105 + .p2align 2, 0x0 +_var105: + .long 105 ; 0x69 + + .section seg,sect106 + .globl _var106 ; @var106 + .p2align 2, 0x0 +_var106: + .long 106 ; 0x6a + + .section seg,sect107 + .globl _var107 ; @var107 + .p2align 2, 0x0 +_var107: + .long 107 ; 0x6b + + .section seg,sect108 + .globl _var108 ; @var108 + .p2align 2, 0x0 +_var108: + .long 108 ; 0x6c + + .section seg,sect109 + .globl _var109 ; @var109 + .p2align 2, 0x0 +_var109: + .long 109 ; 0x6d + + .section seg,sect110 + 
.globl _var110 ; @var110 + .p2align 2, 0x0 +_var110: + .long 110 ; 0x6e + + .section seg,sect111 + .globl _var111 ; @var111 + .p2align 2, 0x0 +_var111: + .long 111 ; 0x6f + + .section seg,sect112 + .globl _var112 ; @var112 + .p2align 2, 0x0 +_var112: + .long 112 ; 0x70 + + .section seg,sect113 + .globl _var113 ; @var113 + .p2align 2, 0x0 +_var113: + .long 113 ; 0x71 + + .section seg,sect114 + .globl _var114 ; @var114 + .p2align 2, 0x0 +_var114: + .long 114 ; 0x72 + + .section seg,sect115 + .globl _var115 ; @var115 + .p2align 2, 0x0 +_var115: + .long 115 ; 0x73 + + .section seg,sect116 + .globl _var116 ; @var116 + .p2align 2, 0x0 +_var116: + .long 116 ; 0x74 + + .section seg,sect117 + .globl _var117 ; @var117 + .p2align 2, 0x0 +_var117: + .long 117 ; 0x75 + + .section seg,sect118 + .globl _var118 ; @var118 + .p2align 2, 0x0 +_var118: + .long 118 ; 0x76 + + .section seg,sect119 + .globl _var119 ; @var119 + .p2align 2, 0x0 +_var119: + .long 119 ; 0x77 + + .section seg,sect120 + .globl _var120 ; @var120 + .p2align 2, 0x0 +_var120: + .long 120 ; 0x78 + + .section seg,sect121 + .globl _var121 ; @var121 + .p2align 2, 0x0 +_var121: + .long 121 ; 0x79 + + .section seg,sect122 + .globl _var122 ; @var122 + .p2align 2, 0x0 +_var122: + .long 122 ; 0x7a + + .section seg,sect123 + .globl _var123 ; @var123 + .p2align 2, 0x0 +_var123: + .long 123 ; 0x7b + + .section seg,sect124 + .globl _var124 ; @var124 + .p2align 2, 0x0 +_var124: + .long 124 ; 0x7c + + .section seg,sect125 + .globl _var125 ; @var125 + .p2align 2, 0x0 +_var125: + .long 125 ; 0x7d + + .section seg,sect126 + .globl _var126 ; @var126 + .p2align 2, 0x0 +_var126: + .long 126 ; 0x7e + + .section seg,sect127 + .globl _var127 ; @var127 + .p2align 2, 0x0 +_var127: + .long 127 ; 0x7f + + .section seg,sect128 + .globl _var128 ; @var128 + .p2align 2, 0x0 +_var128: + .long 128 ; 0x80 + + .section seg,sect129 + .globl _var129 ; @var129 + .p2align 2, 0x0 +_var129: + .long 129 ; 0x81 + + .section seg,sect130 + .globl _var130 ; @var130 + .p2align 2, 0x0 +_var130: + .long 130 ; 0x82 + + .section seg,sect131 + .globl _var131 ; @var131 + .p2align 2, 0x0 +_var131: + .long 131 ; 0x83 + + .section seg,sect132 + .globl _var132 ; @var132 + .p2align 2, 0x0 +_var132: + .long 132 ; 0x84 + + .section seg,sect133 + .globl _var133 ; @var133 + .p2align 2, 0x0 +_var133: + .long 133 ; 0x85 + + .section seg,sect134 + .globl _var134 ; @var134 + .p2align 2, 0x0 +_var134: + .long 134 ; 0x86 + + .section seg,sect135 + .globl _var135 ; @var135 + .p2align 2, 0x0 +_var135: + .long 135 ; 0x87 + + .section seg,sect136 + .globl _var136 ; @var136 + .p2align 2, 0x0 +_var136: + .long 136 ; 0x88 + + .section seg,sect137 + .globl _var137 ; @var137 + .p2align 2, 0x0 +_var137: + .long 137 ; 0x89 + + .section seg,sect138 + .globl _var138 ; @var138 + .p2align 2, 0x0 +_var138: + .long 138 ; 0x8a + + .section seg,sect139 + .globl _var139 ; @var139 + .p2align 2, 0x0 +_var139: + .long 139 ; 0x8b + + .section seg,sect140 + .globl _var140 ; @var140 + .p2align 2, 0x0 +_var140: + .long 140 ; 0x8c + + .section seg,sect141 + .globl _var141 ; @var141 + .p2align 2, 0x0 +_var141: + .long 141 ; 0x8d + + .section seg,sect142 + .globl _var142 ; @var142 + .p2align 2, 0x0 +_var142: + .long 142 ; 0x8e + + .section seg,sect143 + .globl _var143 ; @var143 + .p2align 2, 0x0 +_var143: + .long 143 ; 0x8f + + .section seg,sect144 + .globl _var144 ; @var144 + .p2align 2, 0x0 +_var144: + .long 144 ; 0x90 + + .section seg,sect145 + .globl _var145 ; @var145 + .p2align 2, 0x0 +_var145: + .long 145 ; 0x91 + + .section 
seg,sect146 + .globl _var146 ; @var146 + .p2align 2, 0x0 +_var146: + .long 146 ; 0x92 + + .section seg,sect147 + .globl _var147 ; @var147 + .p2align 2, 0x0 +_var147: + .long 147 ; 0x93 + + .section seg,sect148 + .globl _var148 ; @var148 + .p2align 2, 0x0 +_var148: + .long 148 ; 0x94 + + .section seg,sect149 + .globl _var149 ; @var149 + .p2align 2, 0x0 +_var149: + .long 149 ; 0x95 + + .section seg,sect150 + .globl _var150 ; @var150 + .p2align 2, 0x0 +_var150: + .long 150 ; 0x96 + + .section seg,sect151 + .globl _var151 ; @var151 + .p2align 2, 0x0 +_var151: + .long 151 ; 0x97 + + .section seg,sect152 + .globl _var152 ; @var152 + .p2align 2, 0x0 +_var152: + .long 152 ; 0x98 + + .section seg,sect153 + .globl _var153 ; @var153 + .p2align 2, 0x0 +_var153: + .long 153 ; 0x99 + + .section seg,sect154 + .globl _var154 ; @var154 + .p2align 2, 0x0 +_var154: + .long 154 ; 0x9a + + .section seg,sect155 + .globl _var155 ; @var155 + .p2align 2, 0x0 +_var155: + .long 155 ; 0x9b + + .section seg,sect156 + .globl _var156 ; @var156 + .p2align 2, 0x0 +_var156: + .long 156 ; 0x9c + + .section seg,sect157 + .globl _var157 ; @var157 + .p2align 2, 0x0 +_var157: + .long 157 ; 0x9d + + .section seg,sect158 + .globl _var158 ; @var158 + .p2align 2, 0x0 +_var158: + .long 158 ; 0x9e + + .section seg,sect159 + .globl _var159 ; @var159 + .p2align 2, 0x0 +_var159: + .long 159 ; 0x9f + + .section seg,sect160 + .globl _var160 ; @var160 + .p2align 2, 0x0 +_var160: + .long 160 ; 0xa0 + + .section seg,sect161 + .globl _var161 ; @var161 + .p2align 2, 0x0 +_var161: + .long 161 ; 0xa1 + + .section seg,sect162 + .globl _var162 ; @var162 + .p2align 2, 0x0 +_var162: + .long 162 ; 0xa2 + + .section seg,sect163 + .globl _var163 ; @var163 + .p2align 2, 0x0 +_var163: + .long 163 ; 0xa3 + + .section seg,sect164 + .globl _var164 ; @var164 + .p2align 2, 0x0 +_var164: + .long 164 ; 0xa4 + + .section seg,sect165 + .globl _var165 ; @var165 + .p2align 2, 0x0 +_var165: + .long 165 ; 0xa5 + + .section seg,sect166 + .globl _var166 ; @var166 + .p2align 2, 0x0 +_var166: + .long 166 ; 0xa6 + + .section seg,sect167 + .globl _var167 ; @var167 + .p2align 2, 0x0 +_var167: + .long 167 ; 0xa7 + + .section seg,sect168 + .globl _var168 ; @var168 + .p2align 2, 0x0 +_var168: + .long 168 ; 0xa8 + + .section seg,sect169 + .globl _var169 ; @var169 + .p2align 2, 0x0 +_var169: + .long 169 ; 0xa9 + + .section seg,sect170 + .globl _var170 ; @var170 + .p2align 2, 0x0 +_var170: + .long 170 ; 0xaa + + .section seg,sect171 + .globl _var171 ; @var171 + .p2align 2, 0x0 +_var171: + .long 171 ; 0xab + + .section seg,sect172 + .globl _var172 ; @var172 + .p2align 2, 0x0 +_var172: + .long 172 ; 0xac + + .section seg,sect173 + .globl _var173 ; @var173 + .p2align 2, 0x0 +_var173: + .long 173 ; 0xad + + .section seg,sect174 + .globl _var174 ; @var174 + .p2align 2, 0x0 +_var174: + .long 174 ; 0xae + + .section seg,sect175 + .globl _var175 ; @var175 + .p2align 2, 0x0 +_var175: + .long 175 ; 0xaf + + .section seg,sect176 + .globl _var176 ; @var176 + .p2align 2, 0x0 +_var176: + .long 176 ; 0xb0 + + .section seg,sect177 + .globl _var177 ; @var177 + .p2align 2, 0x0 +_var177: + .long 177 ; 0xb1 + + .section seg,sect178 + .globl _var178 ; @var178 + .p2align 2, 0x0 +_var178: + .long 178 ; 0xb2 + + .section seg,sect179 + .globl _var179 ; @var179 + .p2align 2, 0x0 +_var179: + .long 179 ; 0xb3 + + .section seg,sect180 + .globl _var180 ; @var180 + .p2align 2, 0x0 +_var180: + .long 180 ; 0xb4 + + .section seg,sect181 + .globl _var181 ; @var181 + .p2align 2, 0x0 +_var181: + .long 181 ; 0xb5 + + 
.section seg,sect182 + .globl _var182 ; @var182 + .p2align 2, 0x0 +_var182: + .long 182 ; 0xb6 + + .section seg,sect183 + .globl _var183 ; @var183 + .p2align 2, 0x0 +_var183: + .long 183 ; 0xb7 + + .section seg,sect184 + .globl _var184 ; @var184 + .p2align 2, 0x0 +_var184: + .long 184 ; 0xb8 + + .section seg,sect185 + .globl _var185 ; @var185 + .p2align 2, 0x0 +_var185: + .long 185 ; 0xb9 + + .section seg,sect186 + .globl _var186 ; @var186 + .p2align 2, 0x0 +_var186: + .long 186 ; 0xba + + .section seg,sect187 + .globl _var187 ; @var187 + .p2align 2, 0x0 +_var187: + .long 187 ; 0xbb + + .section seg,sect188 + .globl _var188 ; @var188 + .p2align 2, 0x0 +_var188: + .long 188 ; 0xbc + + .section seg,sect189 + .globl _var189 ; @var189 + .p2align 2, 0x0 +_var189: + .long 189 ; 0xbd + + .section seg,sect190 + .globl _var190 ; @var190 + .p2align 2, 0x0 +_var190: + .long 190 ; 0xbe + + .section seg,sect191 + .globl _var191 ; @var191 + .p2align 2, 0x0 +_var191: + .long 191 ; 0xbf + + .section seg,sect192 + .globl _var192 ; @var192 + .p2align 2, 0x0 +_var192: + .long 192 ; 0xc0 + + .section seg,sect193 + .globl _var193 ; @var193 + .p2align 2, 0x0 +_var193: + .long 193 ; 0xc1 + + .section seg,sect194 + .globl _var194 ; @var194 + .p2align 2, 0x0 +_var194: + .long 194 ; 0xc2 + + .section seg,sect195 + .globl _var195 ; @var195 + .p2align 2, 0x0 +_var195: + .long 195 ; 0xc3 + + .section seg,sect196 + .globl _var196 ; @var196 + .p2align 2, 0x0 +_var196: + .long 196 ; 0xc4 + + .section seg,sect197 + .globl _var197 ; @var197 + .p2align 2, 0x0 +_var197: + .long 197 ; 0xc5 + + .section seg,sect198 + .globl _var198 ; @var198 + .p2align 2, 0x0 +_var198: + .long 198 ; 0xc6 + + .section seg,sect199 + .globl _var199 ; @var199 + .p2align 2, 0x0 +_var199: + .long 199 ; 0xc7 + + .section seg,sect200 + .globl _var200 ; @var200 + .p2align 2, 0x0 +_var200: + .long 200 ; 0xc8 + + .section seg,sect201 + .globl _var201 ; @var201 + .p2align 2, 0x0 +_var201: + .long 201 ; 0xc9 + + .section seg,sect202 + .globl _var202 ; @var202 + .p2align 2, 0x0 +_var202: + .long 202 ; 0xca + + .section seg,sect203 + .globl _var203 ; @var203 + .p2align 2, 0x0 +_var203: + .long 203 ; 0xcb + + .section seg,sect204 + .globl _var204 ; @var204 + .p2align 2, 0x0 +_var204: + .long 204 ; 0xcc + + .section seg,sect205 + .globl _var205 ; @var205 + .p2align 2, 0x0 +_var205: + .long 205 ; 0xcd + + .section seg,sect206 + .globl _var206 ; @var206 + .p2align 2, 0x0 +_var206: + .long 206 ; 0xce + + .section seg,sect207 + .globl _var207 ; @var207 + .p2align 2, 0x0 +_var207: + .long 207 ; 0xcf + + .section seg,sect208 + .globl _var208 ; @var208 + .p2align 2, 0x0 +_var208: + .long 208 ; 0xd0 + + .section seg,sect209 + .globl _var209 ; @var209 + .p2align 2, 0x0 +_var209: + .long 209 ; 0xd1 + + .section seg,sect210 + .globl _var210 ; @var210 + .p2align 2, 0x0 +_var210: + .long 210 ; 0xd2 + + .section seg,sect211 + .globl _var211 ; @var211 + .p2align 2, 0x0 +_var211: + .long 211 ; 0xd3 + + .section seg,sect212 + .globl _var212 ; @var212 + .p2align 2, 0x0 +_var212: + .long 212 ; 0xd4 + + .section seg,sect213 + .globl _var213 ; @var213 + .p2align 2, 0x0 +_var213: + .long 213 ; 0xd5 + + .section seg,sect214 + .globl _var214 ; @var214 + .p2align 2, 0x0 +_var214: + .long 214 ; 0xd6 + + .section seg,sect215 + .globl _var215 ; @var215 + .p2align 2, 0x0 +_var215: + .long 215 ; 0xd7 + + .section seg,sect216 + .globl _var216 ; @var216 + .p2align 2, 0x0 +_var216: + .long 216 ; 0xd8 + + .section seg,sect217 + .globl _var217 ; @var217 + .p2align 2, 0x0 +_var217: + .long 217 ; 
0xd9 + + .section seg,sect218 + .globl _var218 ; @var218 + .p2align 2, 0x0 +_var218: + .long 218 ; 0xda + + .section seg,sect219 + .globl _var219 ; @var219 + .p2align 2, 0x0 +_var219: + .long 219 ; 0xdb + + .section seg,sect220 + .globl _var220 ; @var220 + .p2align 2, 0x0 +_var220: + .long 220 ; 0xdc + + .section seg,sect221 + .globl _var221 ; @var221 + .p2align 2, 0x0 +_var221: + .long 221 ; 0xdd + + .section seg,sect222 + .globl _var222 ; @var222 + .p2align 2, 0x0 +_var222: + .long 222 ; 0xde + + .section seg,sect223 + .globl _var223 ; @var223 + .p2align 2, 0x0 +_var223: + .long 223 ; 0xdf + + .section seg,sect224 + .globl _var224 ; @var224 + .p2align 2, 0x0 +_var224: + .long 224 ; 0xe0 + + .section seg,sect225 + .globl _var225 ; @var225 + .p2align 2, 0x0 +_var225: + .long 225 ; 0xe1 + + .section seg,sect226 + .globl _var226 ; @var226 + .p2align 2, 0x0 +_var226: + .long 226 ; 0xe2 + + .section seg,sect227 + .globl _var227 ; @var227 + .p2align 2, 0x0 +_var227: + .long 227 ; 0xe3 + + .section seg,sect228 + .globl _var228 ; @var228 + .p2align 2, 0x0 +_var228: + .long 228 ; 0xe4 + + .section seg,sect229 + .globl _var229 ; @var229 + .p2align 2, 0x0 +_var229: + .long 229 ; 0xe5 + + .section seg,sect230 + .globl _var230 ; @var230 + .p2align 2, 0x0 +_var230: + .long 230 ; 0xe6 + + .section seg,sect231 + .globl _var231 ; @var231 + .p2align 2, 0x0 +_var231: + .long 231 ; 0xe7 + + .section seg,sect232 + .globl _var232 ; @var232 + .p2align 2, 0x0 +_var232: + .long 232 ; 0xe8 + + .section seg,sect233 + .globl _var233 ; @var233 + .p2align 2, 0x0 +_var233: + .long 233 ; 0xe9 + + .section seg,sect234 + .globl _var234 ; @var234 + .p2align 2, 0x0 +_var234: + .long 234 ; 0xea + + .section seg,sect235 + .globl _var235 ; @var235 + .p2align 2, 0x0 +_var235: + .long 235 ; 0xeb + + .section seg,sect236 + .globl _var236 ; @var236 + .p2align 2, 0x0 +_var236: + .long 236 ; 0xec + + .section seg,sect237 + .globl _var237 ; @var237 + .p2align 2, 0x0 +_var237: + .long 237 ; 0xed + + .section seg,sect238 + .globl _var238 ; @var238 + .p2align 2, 0x0 +_var238: + .long 238 ; 0xee + + .section seg,sect239 + .globl _var239 ; @var239 + .p2align 2, 0x0 +_var239: + .long 239 ; 0xef + + .section seg,sect240 + .globl _var240 ; @var240 + .p2align 2, 0x0 +_var240: + .long 240 ; 0xf0 + + .section seg,sect241 + .globl _var241 ; @var241 + .p2align 2, 0x0 +_var241: + .long 241 ; 0xf1 + + .section seg,sect242 + .globl _var242 ; @var242 + .p2align 2, 0x0 +_var242: + .long 242 ; 0xf2 + + .section seg,sect243 + .globl _var243 ; @var243 + .p2align 2, 0x0 +_var243: + .long 243 ; 0xf3 + + .section seg,sect244 + .globl _var244 ; @var244 + .p2align 2, 0x0 +_var244: + .long 244 ; 0xf4 + + .section seg,sect245 + .globl _var245 ; @var245 + .p2align 2, 0x0 +_var245: + .long 245 ; 0xf5 + + .section seg,sect246 + .globl _var246 ; @var246 + .p2align 2, 0x0 +_var246: + .long 246 ; 0xf6 + + .section seg,sect247 + .globl _var247 ; @var247 + .p2align 2, 0x0 +_var247: + .long 247 ; 0xf7 + + .section seg,sect248 + .globl _var248 ; @var248 + .p2align 2, 0x0 +_var248: + .long 248 ; 0xf8 + + .section seg,sect249 + .globl _var249 ; @var249 + .p2align 2, 0x0 +_var249: + .long 249 ; 0xf9 + + .section seg,sect250 + .globl _var250 ; @var250 + .p2align 2, 0x0 +_var250: + .long 250 ; 0xfa + + .section seg,sect251 + .globl _var251 ; @var251 + .p2align 2, 0x0 +_var251: + .long 251 ; 0xfb + + .section seg,sect252 + .globl _var252 ; @var252 + .p2align 2, 0x0 +_var252: + .long 252 ; 0xfc + + .section seg,sect253 + .globl _var253 ; @var253 + .p2align 2, 0x0 +_var253: + 
.long 253 ; 0xfd + + .section seg,sect254 + .globl _var254 ; @var254 + .p2align 2, 0x0 +_var254: + .long 254 ; 0xfe + + .section seg,sect255 + .globl _var255 ; @var255 + .p2align 2, 0x0 +_var255: + .long 255 ; 0xff + + .section seg,sect256 + .globl _var256 ; @var256 + .p2align 2, 0x0 +_var256: + .long 256 ; 0x100 + + .section seg,sect257 + .globl _var257 ; @var257 + .p2align 2, 0x0 +_var257: + .long 257 ; 0x101 + +.subsections_via_symbols From abb8c4ba60f6512b63eba1cfc5723248a2e8d635 Mon Sep 17 00:00:00 2001 From: Prabhu Rajasekaran Date: Tue, 11 Nov 2025 14:27:03 -0800 Subject: [PATCH 58/64] [lld][macho] Fix segfault while processing malformed object file. (#167025) We ran into a use case where a MachO object file had a section symbol with no section associated with it, which caused a segfault during linking. This patch handles such cases gracefully and keeps the linker from crashing. --------- Co-authored-by: Ellis Hoag --- lld/MachO/InputFiles.cpp | 11 ++ ...dle-invalid-section-reference-too-big.test | 128 ++++++++++++++++++ ...handle-invalid-section-reference-zero.test | 128 ++++++++++++++++++ 3 files changed, 267 insertions(+) create mode 100644 lld/test/MachO/handle-invalid-section-reference-too-big.test create mode 100644 lld/test/MachO/handle-invalid-section-reference-zero.test diff --git a/lld/MachO/InputFiles.cpp b/lld/MachO/InputFiles.cpp index 20e4a1d755229..d0128d03a9eab 100644 --- a/lld/MachO/InputFiles.cpp +++ b/lld/MachO/InputFiles.cpp @@ -808,6 +808,17 @@ void ObjFile::parseSymbols(ArrayRef sectionHeaders, continue; if ((sym.n_type & N_TYPE) == N_SECT) { + if (sym.n_sect == 0) { + fatal("section symbol " + StringRef(strtab + sym.n_strx) + " in " + + toString(this) + " has an invalid section index [0]"); + } + if (sym.n_sect > sections.size()) { + fatal("section symbol " + StringRef(strtab + sym.n_strx) + " in " + + toString(this) + " has an invalid section index [" + + Twine(static_cast<uint32_t>(sym.n_sect)) + + "] greater than the total number of sections [" + + Twine(sections.size()) + "]"); + } Subsections &subsections = sections[sym.n_sect - 1]->subsections; // parseSections() may have chosen not to parse this section. if (subsections.empty()) diff --git a/lld/test/MachO/handle-invalid-section-reference-too-big.test b/lld/test/MachO/handle-invalid-section-reference-too-big.test new file mode 100644 index 0000000000000..1642d63e50af4 --- /dev/null +++ b/lld/test/MachO/handle-invalid-section-reference-too-big.test @@ -0,0 +1,128 @@ +# REQUIRES: aarch64 + +## This is a regression test which makes sure that when there is an invalid section index +## associated with a section symbol, the linker does not segfault. + +## Test YAML content was created using the following steps +## 1. Create an object file from the following assembly +## `llvm-mc -filetype=obj -triple=arm64-apple-darwin symbol.s -o symbol.o` +## +## .text +## .section __TEST,__mystuff +## .globl _mysec +## _mysec: +## .byte 0xC3 +## +## 2. Use obj2yaml to convert object file to yaml +## `obj2yaml symbol.o -o symbol.yaml` +## +## 3. Manually set the n_sect value of the ltmp0 symbol to 10, which is greater than the total number of sections (2).
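+## (In Mach-O, n_sect is a 1-based section ordinal and 0 means NO_SECT, so an N_SECT symbol is only valid when 1 <= n_sect <= number of sections.)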
+## + +# RUN: yaml2obj %s -o %t +# RUN: not %lld -platform_version macos 10.14 11.0 -arch arm64 %t 2>&1 | FileCheck %s --check-prefix=FATAL + +# FATAL: error: section symbol ltmp0 in {{.*}} has an invalid section index [10] greater than the total number of sections [2] + +--- !mach-o FileHeader: + magic: 0xFEEDFACF + cputype: 0x100000C + cpusubtype: 0x0 + filetype: 0x1 + ncmds: 3 + sizeofcmds: 336 + flags: 0x0 + reserved: 0x0 +LoadCommands: + - cmd: LC_SEGMENT_64 + cmdsize: 232 + segname: '' + vmaddr: 0 + vmsize: 1 + fileoff: 368 + filesize: 1 + maxprot: 7 + initprot: 7 + nsects: 2 + flags: 0 + Sections: + - sectname: __text + segname: __TEXT + addr: 0x0 + size: 0 + offset: 0x170 + align: 0 + reloff: 0x0 + nreloc: 0 + flags: 0x80000000 + reserved1: 0x0 + reserved2: 0x0 + reserved3: 0x0 + content: '' + - sectname: __mystuff + segname: __TEST + addr: 0x0 + size: 1 + offset: 0x170 + align: 0 + reloff: 0x0 + nreloc: 0 + flags: 0x0 + reserved1: 0x0 + reserved2: 0x0 + reserved3: 0x0 + content: C3 + - cmd: LC_SYMTAB + cmdsize: 24 + symoff: 376 + nsyms: 3 + stroff: 424 + strsize: 24 + - cmd: LC_DYSYMTAB + cmdsize: 80 + ilocalsym: 0 + nlocalsym: 2 + iextdefsym: 2 + nextdefsym: 1 + iundefsym: 3 + nundefsym: 0 + tocoff: 0 + ntoc: 0 + modtaboff: 0 + nmodtab: 0 + extrefsymoff: 0 + nextrefsyms: 0 + indirectsymoff: 0 + nindirectsyms: 0 + extreloff: 0 + nextrel: 0 + locreloff: 0 + nlocrel: 0 +LinkEditData: + NameList: + - n_strx: 14 + n_type: 0xE + n_sect: 10 + n_desc: 0 + n_value: 0 + - n_strx: 8 + n_type: 0xE + n_sect: 2 + n_desc: 0 + n_value: 0 + - n_strx: 1 + n_type: 0xF + n_sect: 2 + n_desc: 0 + n_value: 0 + StringTable: + - '' + - _mysec + - ltmp1 + - ltmp0 + - '' + - '' + - '' + - '' +... diff --git a/lld/test/MachO/handle-invalid-section-reference-zero.test b/lld/test/MachO/handle-invalid-section-reference-zero.test new file mode 100644 index 0000000000000..ab636705198e5 --- /dev/null +++ b/lld/test/MachO/handle-invalid-section-reference-zero.test @@ -0,0 +1,128 @@ +# REQUIRES: aarch64 + +## This is a regression test which makes sure that when there is an invalid section index +## associated with a section symbol, the linker does not segfault. + +## Test YAML content was created using the following steps +## 1. Create an object file from the following assembly +## `llvm-mc -filetype=obj -triple=arm64-apple-darwin symbol.s -o symbol.o` +## +## .text +## .section __TEST,__mystuff +## .globl _mysec +## _mysec: +## .byte 0xC3 +## +## 2. Use obj2yaml to convert object file to yaml +## `obj2yaml symbol.o -o symbol.yaml` +## +## 3. Manually set the n_sect value of the ltmp0 symbol to 0 instead of 1.
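+## (n_sect 0 is NO_SECT in Mach-O; for an N_SECT symbol the linker would index sections[-1], which is the crash this test guards against.)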
+## + +# RUN: yaml2obj %s -o %t +# RUN: not %lld -platform_version macos 10.14 11.0 -arch arm64 %t 2>&1 | FileCheck %s --check-prefix=FATAL + +# FATAL: error: section symbol ltmp0 in {{.*}} has an invalid section index [0] + +--- !mach-o +FileHeader: + magic: 0xFEEDFACF + cputype: 0x100000C + cpusubtype: 0x0 + filetype: 0x1 + ncmds: 3 + sizeofcmds: 336 + flags: 0x0 + reserved: 0x0 +LoadCommands: + - cmd: LC_SEGMENT_64 + cmdsize: 232 + segname: '' + vmaddr: 0 + vmsize: 1 + fileoff: 368 + filesize: 1 + maxprot: 7 + initprot: 7 + nsects: 2 + flags: 0 + Sections: + - sectname: __text + segname: __TEXT + addr: 0x0 + size: 0 + offset: 0x170 + align: 0 + reloff: 0x0 + nreloc: 0 + flags: 0x80000000 + reserved1: 0x0 + reserved2: 0x0 + reserved3: 0x0 + content: '' + - sectname: __mystuff + segname: __TEST + addr: 0x0 + size: 1 + offset: 0x170 + align: 0 + reloff: 0x0 + nreloc: 0 + flags: 0x0 + reserved1: 0x0 + reserved2: 0x0 + reserved3: 0x0 + content: C3 + - cmd: LC_SYMTAB + cmdsize: 24 + symoff: 376 + nsyms: 3 + stroff: 424 + strsize: 24 + - cmd: LC_DYSYMTAB + cmdsize: 80 + ilocalsym: 0 + nlocalsym: 2 + iextdefsym: 2 + nextdefsym: 1 + iundefsym: 3 + nundefsym: 0 + tocoff: 0 + ntoc: 0 + modtaboff: 0 + nmodtab: 0 + extrefsymoff: 0 + nextrefsyms: 0 + indirectsymoff: 0 + nindirectsyms: 0 + extreloff: 0 + nextrel: 0 + locreloff: 0 + nlocrel: 0 +LinkEditData: + NameList: + - n_strx: 14 + n_type: 0xE + n_sect: 0 + n_desc: 0 + n_value: 0 + - n_strx: 8 + n_type: 0xE + n_sect: 2 + n_desc: 0 + n_value: 0 + - n_strx: 1 + n_type: 0xF + n_sect: 2 + n_desc: 0 + n_value: 0 + StringTable: + - '' + - _mysec + - ltmp1 + - ltmp0 + - '' + - '' + - '' + - '' +... From dc0ccbdfc71f73c84d5d0e224d81a0bb9536bad5 Mon Sep 17 00:00:00 2001 From: Alan Zhao Date: Tue, 11 Nov 2025 14:44:55 -0800 Subject: [PATCH 59/64] [compiler-rt][asan] Fix a test on Windows (#167591) Windows doesn't support `pthread_attr`, which was introduced to asan_test.cpp in #165198, so this change `#ifdef`s out the changes made in that PR. Originally reported by Chrome as https://crbug.com/459880605. --- compiler-rt/lib/asan/tests/asan_test.cpp | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/compiler-rt/lib/asan/tests/asan_test.cpp b/compiler-rt/lib/asan/tests/asan_test.cpp index 59d64ac4753ca..06c9fdc9b23db 100644 --- a/compiler-rt/lib/asan/tests/asan_test.cpp +++ b/compiler-rt/lib/asan/tests/asan_test.cpp @@ -1121,6 +1121,8 @@ TEST(AddressSanitizer, StressStackReuseTest) { TEST(AddressSanitizer, ThreadedStressStackReuseTest) { const int kNumThreads = 20; pthread_t t[kNumThreads]; +// pthread_attr isn't supported on Windows. 
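+// There, the threads below are created with default attributes instead.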
+#ifndef _WIN32 size_t curStackSize = 0; pthread_attr_t attr; pthread_attr_init(&attr); @@ -1130,13 +1132,20 @@ TEST(AddressSanitizer, ThreadedStressStackReuseTest) { int rc = pthread_attr_setstacksize(&attr, MIN_STACK_SIZE); ASSERT_EQ(0, rc); } +#endif for (int i = 0; i < kNumThreads; i++) { +#ifdef _WIN32 + PTHREAD_CREATE(&t[i], 0, (void* (*)(void* x))LotsOfStackReuse, 0); +#else PTHREAD_CREATE(&t[i], &attr, (void* (*)(void* x))LotsOfStackReuse, 0); +#endif } for (int i = 0; i < kNumThreads; i++) { PTHREAD_JOIN(t[i], 0); } +#ifndef _WIN32 pthread_attr_destroy(&attr); +#endif } // pthread_exit tries to perform unwinding stuff that leads to dlopen'ing From bbf62dc5441282dd55bcb72406d765ea5855d6f2 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Tue, 11 Nov 2025 14:50:58 -0800 Subject: [PATCH 60/64] AArch64: Use TargetConstant for intrinsic IDs (#166661) These should always use TargetConstant --- .../Target/AArch64/AArch64ISelLowering.cpp | 88 ++++++++++--------- 1 file changed, 47 insertions(+), 41 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 8457f6178fdc2..3c6679f3c1fa7 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -5555,9 +5555,10 @@ SDValue AArch64TargetLowering::LowerGET_ROUNDING(SDValue Op, SDLoc DL(Op); SDValue Chain = Op.getOperand(0); - SDValue FPCR_64 = DAG.getNode( - ISD::INTRINSIC_W_CHAIN, DL, {MVT::i64, MVT::Other}, - {Chain, DAG.getConstant(Intrinsic::aarch64_get_fpcr, DL, MVT::i64)}); + SDValue FPCR_64 = + DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, {MVT::i64, MVT::Other}, + {Chain, DAG.getTargetConstant(Intrinsic::aarch64_get_fpcr, DL, + MVT::i64)}); Chain = FPCR_64.getValue(1); SDValue FPCR_32 = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, FPCR_64); SDValue FltRounds = DAG.getNode(ISD::ADD, DL, MVT::i32, FPCR_32, @@ -5643,7 +5644,8 @@ SDValue AArch64TargetLowering::LowerSET_FPMODE(SDValue Op, // Set new value of FPCR. SDValue Ops2[] = { - Chain, DAG.getConstant(Intrinsic::aarch64_set_fpcr, DL, MVT::i64), FPCR}; + Chain, DAG.getTargetConstant(Intrinsic::aarch64_set_fpcr, DL, MVT::i64), + FPCR}; return DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, Ops2); } @@ -5666,9 +5668,9 @@ SDValue AArch64TargetLowering::LowerRESET_FPMODE(SDValue Op, DAG.getConstant(AArch64::ReservedFPControlBits, DL, MVT::i64)); // Set new value of FPCR. - SDValue Ops2[] = {Chain, - DAG.getConstant(Intrinsic::aarch64_set_fpcr, DL, MVT::i64), - FPSCRMasked}; + SDValue Ops2[] = { + Chain, DAG.getTargetConstant(Intrinsic::aarch64_set_fpcr, DL, MVT::i64), + FPSCRMasked}; return DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, Ops2); } @@ -7300,17 +7302,19 @@ SDValue AArch64TargetLowering::LowerVECTOR_COMPRESS(SDValue Op, SDValue Compressed = DAG.getNode( ISD::INTRINSIC_WO_CHAIN, DL, Vec.getValueType(), - DAG.getConstant(Intrinsic::aarch64_sve_compact, DL, MVT::i64), Mask, Vec); + DAG.getTargetConstant(Intrinsic::aarch64_sve_compact, DL, MVT::i64), Mask, + Vec); // compact fills with 0s, so if our passthru is all 0s, do nothing here. 
if (HasPassthru && !ISD::isConstantSplatVectorAllZeros(Passthru.getNode())) { SDValue Offset = DAG.getNode( ISD::INTRINSIC_WO_CHAIN, DL, MVT::i64, - DAG.getConstant(Intrinsic::aarch64_sve_cntp, DL, MVT::i64), Mask, Mask); + DAG.getTargetConstant(Intrinsic::aarch64_sve_cntp, DL, MVT::i64), Mask, + Mask); SDValue IndexMask = DAG.getNode( ISD::INTRINSIC_WO_CHAIN, DL, MaskVT, - DAG.getConstant(Intrinsic::aarch64_sve_whilelo, DL, MVT::i64), + DAG.getTargetConstant(Intrinsic::aarch64_sve_whilelo, DL, MVT::i64), DAG.getConstant(0, DL, MVT::i64), Offset); Compressed = @@ -7439,10 +7443,10 @@ static SDValue LowerFLDEXP(SDValue Op, SelectionDAG &DAG) { DAG.getUNDEF(ExpVT), Exp, Zero); SDValue VPg = getPTrue(DAG, DL, XVT.changeVectorElementType(MVT::i1), AArch64SVEPredPattern::all); - SDValue FScale = - DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, XVT, - DAG.getConstant(Intrinsic::aarch64_sve_fscale, DL, MVT::i64), - VPg, VX, VExp); + SDValue FScale = DAG.getNode( + ISD::INTRINSIC_WO_CHAIN, DL, XVT, + DAG.getTargetConstant(Intrinsic::aarch64_sve_fscale, DL, MVT::i64), VPg, + VX, VExp); SDValue Final = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, X.getValueType(), FScale, Zero); if (X.getValueType() != XScalarTy) @@ -8106,7 +8110,7 @@ static SDValue emitRestoreZALazySave(SDValue Chain, SDLoc DL, TLI.getLibcallName(LC), TLI.getPointerTy(DAG.getDataLayout())); SDValue TPIDR2_EL0 = DAG.getNode( ISD::INTRINSIC_W_CHAIN, DL, MVT::i64, Chain, - DAG.getConstant(Intrinsic::aarch64_sme_get_tpidr2, DL, MVT::i32)); + DAG.getTargetConstant(Intrinsic::aarch64_sme_get_tpidr2, DL, MVT::i32)); // Copy the address of the TPIDR2 block into X0 before 'calling' the // RESTORE_ZA pseudo. SDValue Glue; @@ -8121,7 +8125,7 @@ static SDValue emitRestoreZALazySave(SDValue Chain, SDLoc DL, // Finally reset the TPIDR2_EL0 register to 0. 
Chain = DAG.getNode( ISD::INTRINSIC_VOID, DL, MVT::Other, Chain, - DAG.getConstant(Intrinsic::aarch64_sme_set_tpidr2, DL, MVT::i32), + DAG.getTargetConstant(Intrinsic::aarch64_sme_set_tpidr2, DL, MVT::i32), DAG.getConstant(0, DL, MVT::i64)); TPIDR2.Uses++; return Chain; @@ -8716,7 +8720,7 @@ SDValue AArch64TargetLowering::LowerFormalArguments( if (Attrs.isNewZT0()) Chain = DAG.getNode( ISD::INTRINSIC_VOID, DL, MVT::Other, Chain, - DAG.getConstant(Intrinsic::aarch64_sme_zero_zt, DL, MVT::i32), + DAG.getTargetConstant(Intrinsic::aarch64_sme_zero_zt, DL, MVT::i32), DAG.getTargetConstant(0, DL, MVT::i32)); } @@ -9529,7 +9533,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout())); Chain = DAG.getNode( ISD::INTRINSIC_VOID, DL, MVT::Other, Chain, - DAG.getConstant(Intrinsic::aarch64_sme_set_tpidr2, DL, MVT::i32), + DAG.getTargetConstant(Intrinsic::aarch64_sme_set_tpidr2, DL, MVT::i32), TPIDR2ObjAddr); OptimizationRemarkEmitter ORE(&MF.getFunction()); ORE.emit([&]() { @@ -13421,8 +13425,8 @@ SDValue ReconstructShuffleWithRuntimeMask(SDValue Op, SelectionDAG &DAG) { return DAG.getNode( ISD::INTRINSIC_WO_CHAIN, DL, VT, - DAG.getConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32), SourceVec, - MaskSourceVec); + DAG.getTargetConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32), + SourceVec, MaskSourceVec); } // Gather data to see if the operation can be modelled as a @@ -14278,14 +14282,16 @@ static SDValue GenerateTBL(SDValue Op, ArrayRef ShuffleMask, V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V1Cst); Shuffle = DAG.getNode( ISD::INTRINSIC_WO_CHAIN, DL, IndexVT, - DAG.getConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32), V1Cst, + DAG.getTargetConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32), + V1Cst, DAG.getBuildVector(IndexVT, DL, ArrayRef(TBLMask.data(), IndexLen))); } else { if (IndexLen == 8) { V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V2Cst); Shuffle = DAG.getNode( ISD::INTRINSIC_WO_CHAIN, DL, IndexVT, - DAG.getConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32), V1Cst, + DAG.getTargetConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32), + V1Cst, DAG.getBuildVector(IndexVT, DL, ArrayRef(TBLMask.data(), IndexLen))); } else { // FIXME: We cannot, for the moment, emit a TBL2 instruction because we @@ -14296,8 +14302,8 @@ static SDValue GenerateTBL(SDValue Op, ArrayRef ShuffleMask, // IndexLen)); Shuffle = DAG.getNode( ISD::INTRINSIC_WO_CHAIN, DL, IndexVT, - DAG.getConstant(Intrinsic::aarch64_neon_tbl2, DL, MVT::i32), V1Cst, - V2Cst, + DAG.getTargetConstant(Intrinsic::aarch64_neon_tbl2, DL, MVT::i32), + V1Cst, V2Cst, DAG.getBuildVector(IndexVT, DL, ArrayRef(TBLMask.data(), IndexLen))); } } @@ -16450,10 +16456,10 @@ SDValue AArch64TargetLowering::LowerVectorSRA_SRL_SHL(SDValue Op, if (isVShiftLImm(Op.getOperand(1), VT, false, Cnt) && Cnt < EltSize) return DAG.getNode(AArch64ISD::VSHL, DL, VT, Op.getOperand(0), DAG.getTargetConstant(Cnt, DL, MVT::i32)); - return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, - DAG.getConstant(Intrinsic::aarch64_neon_ushl, DL, - MVT::i32), - Op.getOperand(0), Op.getOperand(1)); + return DAG.getNode( + ISD::INTRINSIC_WO_CHAIN, DL, VT, + DAG.getTargetConstant(Intrinsic::aarch64_neon_ushl, DL, MVT::i32), + Op.getOperand(0), Op.getOperand(1)); case ISD::SRA: case ISD::SRL: if (VT.isScalableVector() && @@ -20049,7 +20055,7 @@ static SDValue performFpToIntCombine(SDNode *N, SelectionDAG &DAG, : Intrinsic::aarch64_neon_vcvtfp2fxu; SDValue FixConv = 
DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, ResTy, - DAG.getConstant(IntrinsicOpcode, DL, MVT::i32), + DAG.getTargetConstant(IntrinsicOpcode, DL, MVT::i32), Op->getOperand(0), DAG.getTargetConstant(C, DL, MVT::i32)); // We can handle smaller integers by generating an extra trunc. if (IntBits < FloatBits) @@ -27338,8 +27344,8 @@ static SDValue combineSVEPrefetchVecBaseImmOff(SDNode *N, SelectionDAG &DAG, // ...and remap the intrinsic `aarch64_sve_prf_gather_scalar_offset` to // `aarch64_sve_prfb_gather_uxtw_index`. SDLoc DL(N); - Ops[1] = DAG.getConstant(Intrinsic::aarch64_sve_prfb_gather_uxtw_index, DL, - MVT::i64); + Ops[1] = DAG.getTargetConstant(Intrinsic::aarch64_sve_prfb_gather_uxtw_index, + DL, MVT::i64); return DAG.getNode(N->getOpcode(), DL, DAG.getVTList(MVT::Other), Ops); } @@ -31204,10 +31210,10 @@ static SDValue GenerateFixedLengthSVETBL(SDValue Op, SDValue Op1, SDValue Op2, SDValue Shuffle; if (IsSingleOp) - Shuffle = - DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, ContainerVT, - DAG.getConstant(Intrinsic::aarch64_sve_tbl, DL, MVT::i32), - Op1, SVEMask); + Shuffle = DAG.getNode( + ISD::INTRINSIC_WO_CHAIN, DL, ContainerVT, + DAG.getTargetConstant(Intrinsic::aarch64_sve_tbl, DL, MVT::i32), Op1, + SVEMask); else if (Subtarget.hasSVE2()) { if (!MinMaxEqual) { unsigned MinNumElts = AArch64::SVEBitsPerBlock / BitsPerElt; @@ -31226,10 +31232,10 @@ static SDValue GenerateFixedLengthSVETBL(SDValue Op, SDValue Op1, SDValue Op2, SVEMask = convertToScalableVector( DAG, getContainerForFixedLengthVector(DAG, MaskType), UpdatedVecMask); } - Shuffle = - DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, ContainerVT, - DAG.getConstant(Intrinsic::aarch64_sve_tbl2, DL, MVT::i32), - Op1, Op2, SVEMask); + Shuffle = DAG.getNode( + ISD::INTRINSIC_WO_CHAIN, DL, ContainerVT, + DAG.getTargetConstant(Intrinsic::aarch64_sve_tbl2, DL, MVT::i32), Op1, + Op2, SVEMask); } Shuffle = convertFromScalableVector(DAG, VT, Shuffle); return DAG.getNode(ISD::BITCAST, DL, Op.getValueType(), Shuffle); @@ -31389,8 +31395,8 @@ SDValue AArch64TargetLowering::LowerFixedLengthVECTOR_SHUFFLEToSVE( unsigned SegmentElts = VT.getVectorNumElements() / Segments; if (std::optional Lane = isDUPQMask(ShuffleMask, Segments, SegmentElts)) { - SDValue IID = - DAG.getConstant(Intrinsic::aarch64_sve_dup_laneq, DL, MVT::i64); + SDValue IID = DAG.getTargetConstant(Intrinsic::aarch64_sve_dup_laneq, + DL, MVT::i64); return convertFromScalableVector( DAG, VT, DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, ContainerVT, From b612b10c9c02cca602820f1b525d0f5c9533ad77 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Tue, 11 Nov 2025 22:57:18 +0000 Subject: [PATCH 61/64] [VPlan] Add tests for hoisting predicated loads. Adds test coverage with loops where the same loads get executed under complementary predicates and can be hoisted, together with a set of negative test cases. 
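For illustration, a reduced form of the hoistable pattern (identifiers are illustrative, condensed from the added tests): the same address %gep.src is loaded on both sides of the complementary predicates %c.0 and !%c.0, so a single unconditional load before the branch would be equivalent.

  define void @sketch(ptr %dst, ptr %src, ptr %cond, i32 %n) {
  entry:
    br label %loop

  loop:
    %iv = phi i32 [ 0, %entry ], [ %iv.next, %latch ]
    %gep.cond = getelementptr inbounds i32, ptr %cond, i32 %iv
    %c = load i32, ptr %gep.cond, align 4
    %c.0 = icmp ule i32 %c, 11
    %gep.src = getelementptr inbounds i32, ptr %src, i32 %iv
    br i1 %c.0, label %then, label %else

  then:                                        ; runs when %c.0
    %l.then = load i32, ptr %gep.src, align 4  ; same address as in %else
    %add = add i32 %l.then, 10
    br label %latch

  else:                                        ; runs when !%c.0
    %l.else = load i32, ptr %gep.src, align 4  ; same address as in %then
    %sub = sub i32 %l.else, 5
    br label %latch

  latch:
    %res = phi i32 [ %add, %then ], [ %sub, %else ]
    %gep.dst = getelementptr inbounds i32, ptr %dst, i32 %iv
    store i32 %res, ptr %gep.dst, align 4
    %iv.next = add nuw nsw i32 %iv, 1
    %ec = icmp eq i32 %iv.next, %n
    br i1 %ec, label %exit, label %loop

  exit:
    ret void
  }

The negative tests cover cases where this is not legal, e.g. when an intervening store may alias the loaded address.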
--- ...predicated-loads-with-predicated-stores.ll | 761 ++++++++++ .../LoopVectorize/hoist-predicated-loads.ll | 1291 +++++++++++++++++ 2 files changed, 2052 insertions(+) create mode 100644 llvm/test/Transforms/LoopVectorize/hoist-predicated-loads-with-predicated-stores.ll create mode 100644 llvm/test/Transforms/LoopVectorize/hoist-predicated-loads.ll diff --git a/llvm/test/Transforms/LoopVectorize/hoist-predicated-loads-with-predicated-stores.ll b/llvm/test/Transforms/LoopVectorize/hoist-predicated-loads-with-predicated-stores.ll new file mode 100644 index 0000000000000..d447a39aafd93 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/hoist-predicated-loads-with-predicated-stores.ll @@ -0,0 +1,761 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --filter-out-after "scalar.ph:" --version 6 +; RUN: opt -passes=loop-vectorize -force-vector-width=2 -force-vector-interleave=1 -S %s | FileCheck %s + +define void @test_stores_noalias_via_rt_checks_after_loads(ptr %dst, ptr %src, ptr %cond, i32 %n) { +; CHECK-LABEL: define void @test_stores_noalias_via_rt_checks_after_loads( +; CHECK-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], ptr [[COND:%.*]], i32 [[N:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 2 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]] +; CHECK: [[VECTOR_MEMCHECK]]: +; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[N]], -1 +; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64 +; CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 2 +; CHECK-NEXT: [[TMP3:%.*]] = add nuw nsw i64 [[TMP2]], 4 +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP3]] +; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[COND]], i64 [[TMP3]] +; CHECK-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP3]] +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP1]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[COND]], [[SCEVGEP]] +; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; CHECK-NEXT: [[BOUND03:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP2]] +; CHECK-NEXT: [[BOUND14:%.*]] = icmp ult ptr [[SRC]], [[SCEVGEP]] +; CHECK-NEXT: [[FOUND_CONFLICT5:%.*]] = and i1 [[BOUND03]], [[BOUND14]] +; CHECK-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT5]] +; CHECK-NEXT: br i1 [[CONFLICT_RDX]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N]], 2 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE17:.*]] ] +; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[INDEX]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[INDEX]], 1 +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[COND]], i32 [[TMP4]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP6]], align 4, !alias.scope [[META0:![0-9]+]] +; CHECK-NEXT: [[TMP7:%.*]] = icmp ule <2 x i32> [[WIDE_LOAD]], splat (i32 11) +; CHECK-NEXT: [[TMP8:%.*]] = xor <2 x i1> [[TMP7]], splat (i1 true) +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x i1> [[TMP8]], i32 0 +; CHECK-NEXT: br i1 [[TMP9]], label %[[PRED_LOAD_IF:.*]], label %[[PRED_LOAD_CONTINUE:.*]] +; CHECK: [[PRED_LOAD_IF]]: +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[TMP4]] +; CHECK-NEXT: [[TMP11:%.*]] 
= load i32, ptr [[TMP10]], align 4, !alias.scope [[META3:![0-9]+]] +; CHECK-NEXT: [[TMP12:%.*]] = insertelement <2 x i32> poison, i32 [[TMP11]], i32 0 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE]] +; CHECK: [[PRED_LOAD_CONTINUE]]: +; CHECK-NEXT: [[TMP13:%.*]] = phi <2 x i32> [ poison, %[[VECTOR_BODY]] ], [ [[TMP12]], %[[PRED_LOAD_IF]] ] +; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x i1> [[TMP8]], i32 1 +; CHECK-NEXT: br i1 [[TMP14]], label %[[PRED_LOAD_IF6:.*]], label %[[PRED_LOAD_CONTINUE7:.*]] +; CHECK: [[PRED_LOAD_IF6]]: +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[TMP5]] +; CHECK-NEXT: [[TMP16:%.*]] = load i32, ptr [[TMP15]], align 4, !alias.scope [[META3]] +; CHECK-NEXT: [[TMP17:%.*]] = insertelement <2 x i32> [[TMP13]], i32 [[TMP16]], i32 1 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE7]] +; CHECK: [[PRED_LOAD_CONTINUE7]]: +; CHECK-NEXT: [[TMP18:%.*]] = phi <2 x i32> [ [[TMP13]], %[[PRED_LOAD_CONTINUE]] ], [ [[TMP17]], %[[PRED_LOAD_IF6]] ] +; CHECK-NEXT: [[TMP19:%.*]] = sub <2 x i32> [[TMP18]], splat (i32 5) +; CHECK-NEXT: [[TMP20:%.*]] = extractelement <2 x i1> [[TMP8]], i32 0 +; CHECK-NEXT: br i1 [[TMP20]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]] +; CHECK: [[PRED_STORE_IF]]: +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[TMP4]] +; CHECK-NEXT: [[TMP22:%.*]] = extractelement <2 x i32> [[TMP19]], i32 0 +; CHECK-NEXT: store i32 [[TMP22]], ptr [[TMP21]], align 4, !alias.scope [[META5:![0-9]+]], !noalias [[META7:![0-9]+]] +; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE]] +; CHECK: [[PRED_STORE_CONTINUE]]: +; CHECK-NEXT: [[TMP23:%.*]] = extractelement <2 x i1> [[TMP8]], i32 1 +; CHECK-NEXT: br i1 [[TMP23]], label %[[PRED_STORE_IF8:.*]], label %[[PRED_STORE_CONTINUE9:.*]] +; CHECK: [[PRED_STORE_IF8]]: +; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[TMP5]] +; CHECK-NEXT: [[TMP25:%.*]] = extractelement <2 x i32> [[TMP19]], i32 1 +; CHECK-NEXT: store i32 [[TMP25]], ptr [[TMP24]], align 4, !alias.scope [[META5]], !noalias [[META7]] +; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE9]] +; CHECK: [[PRED_STORE_CONTINUE9]]: +; CHECK-NEXT: [[TMP26:%.*]] = extractelement <2 x i1> [[TMP7]], i32 0 +; CHECK-NEXT: br i1 [[TMP26]], label %[[PRED_LOAD_IF10:.*]], label %[[PRED_LOAD_CONTINUE11:.*]] +; CHECK: [[PRED_LOAD_IF10]]: +; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[TMP4]] +; CHECK-NEXT: [[TMP28:%.*]] = load i32, ptr [[TMP27]], align 4, !alias.scope [[META3]] +; CHECK-NEXT: [[TMP29:%.*]] = insertelement <2 x i32> poison, i32 [[TMP28]], i32 0 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE11]] +; CHECK: [[PRED_LOAD_CONTINUE11]]: +; CHECK-NEXT: [[TMP30:%.*]] = phi <2 x i32> [ poison, %[[PRED_STORE_CONTINUE9]] ], [ [[TMP29]], %[[PRED_LOAD_IF10]] ] +; CHECK-NEXT: [[TMP31:%.*]] = extractelement <2 x i1> [[TMP7]], i32 1 +; CHECK-NEXT: br i1 [[TMP31]], label %[[PRED_LOAD_IF12:.*]], label %[[PRED_LOAD_CONTINUE13:.*]] +; CHECK: [[PRED_LOAD_IF12]]: +; CHECK-NEXT: [[TMP32:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[TMP5]] +; CHECK-NEXT: [[TMP33:%.*]] = load i32, ptr [[TMP32]], align 4, !alias.scope [[META3]] +; CHECK-NEXT: [[TMP34:%.*]] = insertelement <2 x i32> [[TMP30]], i32 [[TMP33]], i32 1 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE13]] +; CHECK: [[PRED_LOAD_CONTINUE13]]: +; CHECK-NEXT: [[TMP35:%.*]] = phi <2 x i32> [ [[TMP30]], %[[PRED_LOAD_CONTINUE11]] ], [ [[TMP34]], %[[PRED_LOAD_IF12]] ] +; CHECK-NEXT: [[TMP36:%.*]] = add <2 x i32> [[TMP35]], 
splat (i32 10) +; CHECK-NEXT: [[TMP37:%.*]] = extractelement <2 x i1> [[TMP7]], i32 0 +; CHECK-NEXT: br i1 [[TMP37]], label %[[PRED_STORE_IF14:.*]], label %[[PRED_STORE_CONTINUE15:.*]] +; CHECK: [[PRED_STORE_IF14]]: +; CHECK-NEXT: [[TMP38:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[TMP4]] +; CHECK-NEXT: [[TMP39:%.*]] = extractelement <2 x i32> [[TMP36]], i32 0 +; CHECK-NEXT: store i32 [[TMP39]], ptr [[TMP38]], align 4, !alias.scope [[META5]], !noalias [[META7]] +; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE15]] +; CHECK: [[PRED_STORE_CONTINUE15]]: +; CHECK-NEXT: [[TMP40:%.*]] = extractelement <2 x i1> [[TMP7]], i32 1 +; CHECK-NEXT: br i1 [[TMP40]], label %[[PRED_STORE_IF16:.*]], label %[[PRED_STORE_CONTINUE17]] +; CHECK: [[PRED_STORE_IF16]]: +; CHECK-NEXT: [[TMP41:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[TMP5]] +; CHECK-NEXT: [[TMP42:%.*]] = extractelement <2 x i32> [[TMP36]], i32 1 +; CHECK-NEXT: store i32 [[TMP42]], ptr [[TMP41]], align 4, !alias.scope [[META5]], !noalias [[META7]] +; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE17]] +; CHECK: [[PRED_STORE_CONTINUE17]]: +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 +; CHECK-NEXT: [[TMP43:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP43]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], [[EXIT:label %.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; +entry: + br label %loop + +loop: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop.latch ] + %gep.cond = getelementptr inbounds i32, ptr %cond, i32 %iv + %c = load i32, ptr %gep.cond, align 4 + %c.0 = icmp ule i32 %c, 11 + br i1 %c.0, label %then, label %else + +then: + %gep.src.then = getelementptr inbounds i32, ptr %src, i32 %iv + %l.src.then = load i32, ptr %gep.src.then, align 4 + %add = add i32 %l.src.then, 10 + %gep.dst = getelementptr inbounds i32, ptr %dst, i32 %iv + store i32 %add, ptr %gep.dst, align 4 + br label %loop.latch + +else: + %gep.src.else= getelementptr inbounds i32, ptr %src, i32 %iv + %l.src.else = load i32, ptr %gep.src.else, align 4 + %sub = sub i32 %l.src.else, 5 + %gep.dst.else = getelementptr inbounds i32, ptr %dst, i32 %iv + store i32 %sub, ptr %gep.dst.else, align 4 + br label %loop.latch + +loop.latch: + %iv.next = add nuw nsw i32 %iv, 1 + %ec = icmp eq i32 %iv.next, %n + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + +define void @test_aliasing_store(ptr %dst, ptr %src, ptr %cond, i32 %n) { +; CHECK-LABEL: define void @test_aliasing_store( +; CHECK-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], ptr [[COND:%.*]], i32 [[N:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 2 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]] +; CHECK: [[VECTOR_MEMCHECK]]: +; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[N]], -1 +; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64 +; CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 2 +; CHECK-NEXT: [[TMP3:%.*]] = add nuw nsw i64 [[TMP2]], 4 +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP3]] +; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP3]] +; CHECK-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[COND]], i64 [[TMP3]] +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[SRC]], [[SCEVGEP1]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP]] +; CHECK-NEXT: 
[[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; CHECK-NEXT: [[BOUND03:%.*]] = icmp ult ptr [[SRC]], [[SCEVGEP2]] +; CHECK-NEXT: [[BOUND14:%.*]] = icmp ult ptr [[COND]], [[SCEVGEP]] +; CHECK-NEXT: [[FOUND_CONFLICT5:%.*]] = and i1 [[BOUND03]], [[BOUND14]] +; CHECK-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT5]] +; CHECK-NEXT: [[BOUND06:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP2]] +; CHECK-NEXT: [[BOUND17:%.*]] = icmp ult ptr [[COND]], [[SCEVGEP1]] +; CHECK-NEXT: [[FOUND_CONFLICT8:%.*]] = and i1 [[BOUND06]], [[BOUND17]] +; CHECK-NEXT: [[CONFLICT_RDX9:%.*]] = or i1 [[CONFLICT_RDX]], [[FOUND_CONFLICT8]] +; CHECK-NEXT: br i1 [[CONFLICT_RDX9]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N]], 2 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE21:.*]] ] +; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[INDEX]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[INDEX]], 1 +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[COND]], i32 [[TMP4]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP6]], align 4, !alias.scope [[META12:![0-9]+]] +; CHECK-NEXT: [[TMP7:%.*]] = icmp ule <2 x i32> [[WIDE_LOAD]], splat (i32 11) +; CHECK-NEXT: [[TMP8:%.*]] = xor <2 x i1> [[TMP7]], splat (i1 true) +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x i1> [[TMP8]], i32 0 +; CHECK-NEXT: br i1 [[TMP9]], label %[[PRED_LOAD_IF:.*]], label %[[PRED_LOAD_CONTINUE:.*]] +; CHECK: [[PRED_LOAD_IF]]: +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[TMP4]] +; CHECK-NEXT: store i32 99, ptr [[TMP10]], align 4, !alias.scope [[META15:![0-9]+]], !noalias [[META17:![0-9]+]] +; CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP10]], align 4, !alias.scope [[META15]], !noalias [[META17]] +; CHECK-NEXT: [[TMP12:%.*]] = insertelement <2 x i32> poison, i32 [[TMP11]], i32 0 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE]] +; CHECK: [[PRED_LOAD_CONTINUE]]: +; CHECK-NEXT: [[TMP13:%.*]] = phi <2 x i32> [ poison, %[[VECTOR_BODY]] ], [ [[TMP12]], %[[PRED_LOAD_IF]] ] +; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x i1> [[TMP8]], i32 1 +; CHECK-NEXT: br i1 [[TMP14]], label %[[PRED_LOAD_IF10:.*]], label %[[PRED_LOAD_CONTINUE11:.*]] +; CHECK: [[PRED_LOAD_IF10]]: +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[TMP5]] +; CHECK-NEXT: store i32 99, ptr [[TMP15]], align 4, !alias.scope [[META15]], !noalias [[META17]] +; CHECK-NEXT: [[TMP16:%.*]] = load i32, ptr [[TMP15]], align 4, !alias.scope [[META15]], !noalias [[META17]] +; CHECK-NEXT: [[TMP17:%.*]] = insertelement <2 x i32> [[TMP13]], i32 [[TMP16]], i32 1 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE11]] +; CHECK: [[PRED_LOAD_CONTINUE11]]: +; CHECK-NEXT: [[TMP18:%.*]] = phi <2 x i32> [ [[TMP13]], %[[PRED_LOAD_CONTINUE]] ], [ [[TMP17]], %[[PRED_LOAD_IF10]] ] +; CHECK-NEXT: [[TMP19:%.*]] = sub <2 x i32> [[TMP18]], splat (i32 5) +; CHECK-NEXT: [[TMP20:%.*]] = extractelement <2 x i1> [[TMP8]], i32 0 +; CHECK-NEXT: br i1 [[TMP20]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]] +; CHECK: [[PRED_STORE_IF]]: +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[TMP4]] +; CHECK-NEXT: [[TMP22:%.*]] = extractelement <2 x i32> [[TMP19]], i32 0 +; CHECK-NEXT: store i32 [[TMP22]], ptr [[TMP21]], align 4, !alias.scope 
[[META19:![0-9]+]], !noalias [[META12]] +; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE]] +; CHECK: [[PRED_STORE_CONTINUE]]: +; CHECK-NEXT: [[TMP23:%.*]] = extractelement <2 x i1> [[TMP8]], i32 1 +; CHECK-NEXT: br i1 [[TMP23]], label %[[PRED_STORE_IF12:.*]], label %[[PRED_STORE_CONTINUE13:.*]] +; CHECK: [[PRED_STORE_IF12]]: +; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[TMP5]] +; CHECK-NEXT: [[TMP25:%.*]] = extractelement <2 x i32> [[TMP19]], i32 1 +; CHECK-NEXT: store i32 [[TMP25]], ptr [[TMP24]], align 4, !alias.scope [[META19]], !noalias [[META12]] +; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE13]] +; CHECK: [[PRED_STORE_CONTINUE13]]: +; CHECK-NEXT: [[TMP26:%.*]] = extractelement <2 x i1> [[TMP7]], i32 0 +; CHECK-NEXT: br i1 [[TMP26]], label %[[PRED_LOAD_IF14:.*]], label %[[PRED_LOAD_CONTINUE15:.*]] +; CHECK: [[PRED_LOAD_IF14]]: +; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[TMP4]] +; CHECK-NEXT: [[TMP28:%.*]] = load i32, ptr [[TMP27]], align 4, !alias.scope [[META15]], !noalias [[META17]] +; CHECK-NEXT: [[TMP29:%.*]] = insertelement <2 x i32> poison, i32 [[TMP28]], i32 0 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE15]] +; CHECK: [[PRED_LOAD_CONTINUE15]]: +; CHECK-NEXT: [[TMP30:%.*]] = phi <2 x i32> [ poison, %[[PRED_STORE_CONTINUE13]] ], [ [[TMP29]], %[[PRED_LOAD_IF14]] ] +; CHECK-NEXT: [[TMP31:%.*]] = extractelement <2 x i1> [[TMP7]], i32 1 +; CHECK-NEXT: br i1 [[TMP31]], label %[[PRED_LOAD_IF16:.*]], label %[[PRED_LOAD_CONTINUE17:.*]] +; CHECK: [[PRED_LOAD_IF16]]: +; CHECK-NEXT: [[TMP32:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[TMP5]] +; CHECK-NEXT: [[TMP33:%.*]] = load i32, ptr [[TMP32]], align 4, !alias.scope [[META15]], !noalias [[META17]] +; CHECK-NEXT: [[TMP34:%.*]] = insertelement <2 x i32> [[TMP30]], i32 [[TMP33]], i32 1 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE17]] +; CHECK: [[PRED_LOAD_CONTINUE17]]: +; CHECK-NEXT: [[TMP35:%.*]] = phi <2 x i32> [ [[TMP30]], %[[PRED_LOAD_CONTINUE15]] ], [ [[TMP34]], %[[PRED_LOAD_IF16]] ] +; CHECK-NEXT: [[TMP36:%.*]] = add <2 x i32> [[TMP35]], splat (i32 10) +; CHECK-NEXT: [[TMP37:%.*]] = extractelement <2 x i1> [[TMP7]], i32 0 +; CHECK-NEXT: br i1 [[TMP37]], label %[[PRED_STORE_IF18:.*]], label %[[PRED_STORE_CONTINUE19:.*]] +; CHECK: [[PRED_STORE_IF18]]: +; CHECK-NEXT: [[TMP38:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[TMP4]] +; CHECK-NEXT: [[TMP39:%.*]] = extractelement <2 x i32> [[TMP36]], i32 0 +; CHECK-NEXT: store i32 [[TMP39]], ptr [[TMP38]], align 4, !alias.scope [[META19]], !noalias [[META12]] +; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE19]] +; CHECK: [[PRED_STORE_CONTINUE19]]: +; CHECK-NEXT: [[TMP40:%.*]] = extractelement <2 x i1> [[TMP7]], i32 1 +; CHECK-NEXT: br i1 [[TMP40]], label %[[PRED_STORE_IF20:.*]], label %[[PRED_STORE_CONTINUE21]] +; CHECK: [[PRED_STORE_IF20]]: +; CHECK-NEXT: [[TMP41:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[TMP5]] +; CHECK-NEXT: [[TMP42:%.*]] = extractelement <2 x i32> [[TMP36]], i32 1 +; CHECK-NEXT: store i32 [[TMP42]], ptr [[TMP41]], align 4, !alias.scope [[META19]], !noalias [[META12]] +; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE21]] +; CHECK: [[PRED_STORE_CONTINUE21]]: +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 +; CHECK-NEXT: [[TMP43:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP43]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]] +; 
CHECK-NEXT: br i1 [[CMP_N]], [[EXIT:label %.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; +entry: + br label %loop + +loop: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop.latch ] + %gep.cond = getelementptr inbounds i32, ptr %cond, i32 %iv + %c = load i32, ptr %gep.cond, align 4 + %c.0 = icmp ule i32 %c, 11 + br i1 %c.0, label %then, label %else + +then: + %gep.src.then = getelementptr inbounds i32, ptr %src, i32 %iv + %l.src.then = load i32, ptr %gep.src.then, align 4 + %add = add i32 %l.src.then, 10 + %gep.dst = getelementptr inbounds i32, ptr %dst, i32 %iv + store i32 %add, ptr %gep.dst, align 4 + br label %loop.latch + +else: + %gep.src.else = getelementptr inbounds i32, ptr %src, i32 %iv + store i32 99, ptr %gep.src.else, align 4 + %l.src.else = load i32, ptr %gep.src.else, align 4 + %sub = sub i32 %l.src.else, 5 + %gep.dst.else = getelementptr inbounds i32, ptr %dst, i32 %iv + store i32 %sub, ptr %gep.dst.else, align 4 + br label %loop.latch + +loop.latch: + %iv.next = add nuw nsw i32 %iv, 1 + %ec = icmp eq i32 %iv.next, %n + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + +define void @test_noalias_store_via_runtime_checks(ptr %dst, ptr %dst.1, ptr %src, ptr %cond, i32 %n) { +; CHECK-LABEL: define void @test_noalias_store_via_runtime_checks( +; CHECK-SAME: ptr [[DST:%.*]], ptr [[DST_1:%.*]], ptr [[SRC:%.*]], ptr [[COND:%.*]], i32 [[N:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 2 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]] +; CHECK: [[VECTOR_MEMCHECK]]: +; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[N]], -1 +; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64 +; CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 2 +; CHECK-NEXT: [[TMP3:%.*]] = add nuw nsw i64 [[TMP2]], 4 +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[DST_1]], i64 [[TMP3]] +; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP3]] +; CHECK-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[COND]], i64 [[TMP3]] +; CHECK-NEXT: [[SCEVGEP3:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP3]] +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[DST_1]], [[SCEVGEP1]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP]] +; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; CHECK-NEXT: [[BOUND04:%.*]] = icmp ult ptr [[DST_1]], [[SCEVGEP2]] +; CHECK-NEXT: [[BOUND15:%.*]] = icmp ult ptr [[COND]], [[SCEVGEP]] +; CHECK-NEXT: [[FOUND_CONFLICT6:%.*]] = and i1 [[BOUND04]], [[BOUND15]] +; CHECK-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT6]] +; CHECK-NEXT: [[BOUND07:%.*]] = icmp ult ptr [[DST_1]], [[SCEVGEP3]] +; CHECK-NEXT: [[BOUND18:%.*]] = icmp ult ptr [[SRC]], [[SCEVGEP]] +; CHECK-NEXT: [[FOUND_CONFLICT9:%.*]] = and i1 [[BOUND07]], [[BOUND18]] +; CHECK-NEXT: [[CONFLICT_RDX10:%.*]] = or i1 [[CONFLICT_RDX]], [[FOUND_CONFLICT9]] +; CHECK-NEXT: [[BOUND011:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP2]] +; CHECK-NEXT: [[BOUND112:%.*]] = icmp ult ptr [[COND]], [[SCEVGEP1]] +; CHECK-NEXT: [[FOUND_CONFLICT13:%.*]] = and i1 [[BOUND011]], [[BOUND112]] +; CHECK-NEXT: [[CONFLICT_RDX14:%.*]] = or i1 [[CONFLICT_RDX10]], [[FOUND_CONFLICT13]] +; CHECK-NEXT: [[BOUND015:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP3]] +; CHECK-NEXT: [[BOUND116:%.*]] = icmp ult ptr [[SRC]], [[SCEVGEP1]] +; CHECK-NEXT: [[FOUND_CONFLICT17:%.*]] = and i1 [[BOUND015]], [[BOUND116]] +; CHECK-NEXT: [[CONFLICT_RDX18:%.*]] = or i1 [[CONFLICT_RDX14]], [[FOUND_CONFLICT17]] +; 
CHECK-NEXT: br i1 [[CONFLICT_RDX18]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N]], 2 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE30:.*]] ] +; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[INDEX]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[INDEX]], 1 +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[COND]], i32 [[TMP4]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP6]], align 4, !alias.scope [[META22:![0-9]+]] +; CHECK-NEXT: [[TMP7:%.*]] = icmp ule <2 x i32> [[WIDE_LOAD]], splat (i32 11) +; CHECK-NEXT: [[TMP8:%.*]] = xor <2 x i1> [[TMP7]], splat (i1 true) +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x i1> [[TMP8]], i32 0 +; CHECK-NEXT: br i1 [[TMP9]], label %[[PRED_LOAD_IF:.*]], label %[[PRED_LOAD_CONTINUE:.*]] +; CHECK: [[PRED_LOAD_IF]]: +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[DST_1]], i32 [[TMP4]] +; CHECK-NEXT: store i32 10, ptr [[TMP10]], align 4, !alias.scope [[META25:![0-9]+]], !noalias [[META27:![0-9]+]] +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[TMP4]] +; CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr [[TMP11]], align 4, !alias.scope [[META30:![0-9]+]] +; CHECK-NEXT: [[TMP13:%.*]] = insertelement <2 x i32> poison, i32 [[TMP12]], i32 0 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE]] +; CHECK: [[PRED_LOAD_CONTINUE]]: +; CHECK-NEXT: [[TMP14:%.*]] = phi <2 x i32> [ poison, %[[VECTOR_BODY]] ], [ [[TMP13]], %[[PRED_LOAD_IF]] ] +; CHECK-NEXT: [[TMP15:%.*]] = extractelement <2 x i1> [[TMP8]], i32 1 +; CHECK-NEXT: br i1 [[TMP15]], label %[[PRED_LOAD_IF19:.*]], label %[[PRED_LOAD_CONTINUE20:.*]] +; CHECK: [[PRED_LOAD_IF19]]: +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[DST_1]], i32 [[TMP5]] +; CHECK-NEXT: store i32 10, ptr [[TMP16]], align 4, !alias.scope [[META25]], !noalias [[META27]] +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[TMP5]] +; CHECK-NEXT: [[TMP18:%.*]] = load i32, ptr [[TMP17]], align 4, !alias.scope [[META30]] +; CHECK-NEXT: [[TMP19:%.*]] = insertelement <2 x i32> [[TMP14]], i32 [[TMP18]], i32 1 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE20]] +; CHECK: [[PRED_LOAD_CONTINUE20]]: +; CHECK-NEXT: [[TMP20:%.*]] = phi <2 x i32> [ [[TMP14]], %[[PRED_LOAD_CONTINUE]] ], [ [[TMP19]], %[[PRED_LOAD_IF19]] ] +; CHECK-NEXT: [[TMP21:%.*]] = sub <2 x i32> [[TMP20]], splat (i32 5) +; CHECK-NEXT: [[TMP22:%.*]] = extractelement <2 x i1> [[TMP8]], i32 0 +; CHECK-NEXT: br i1 [[TMP22]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]] +; CHECK: [[PRED_STORE_IF]]: +; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[TMP4]] +; CHECK-NEXT: [[TMP24:%.*]] = extractelement <2 x i32> [[TMP21]], i32 0 +; CHECK-NEXT: store i32 [[TMP24]], ptr [[TMP23]], align 4, !alias.scope [[META31:![0-9]+]], !noalias [[META32:![0-9]+]] +; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE]] +; CHECK: [[PRED_STORE_CONTINUE]]: +; CHECK-NEXT: [[TMP25:%.*]] = extractelement <2 x i1> [[TMP8]], i32 1 +; CHECK-NEXT: br i1 [[TMP25]], label %[[PRED_STORE_IF21:.*]], label %[[PRED_STORE_CONTINUE22:.*]] +; CHECK: [[PRED_STORE_IF21]]: +; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[TMP5]] +; CHECK-NEXT: [[TMP27:%.*]] = extractelement <2 x i32> [[TMP21]], i32 1 +; CHECK-NEXT: store 
i32 [[TMP27]], ptr [[TMP26]], align 4, !alias.scope [[META31]], !noalias [[META32]] +; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE22]] +; CHECK: [[PRED_STORE_CONTINUE22]]: +; CHECK-NEXT: [[TMP28:%.*]] = extractelement <2 x i1> [[TMP7]], i32 0 +; CHECK-NEXT: br i1 [[TMP28]], label %[[PRED_LOAD_IF23:.*]], label %[[PRED_LOAD_CONTINUE24:.*]] +; CHECK: [[PRED_LOAD_IF23]]: +; CHECK-NEXT: [[TMP29:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[TMP4]] +; CHECK-NEXT: [[TMP30:%.*]] = load i32, ptr [[TMP29]], align 4, !alias.scope [[META30]] +; CHECK-NEXT: [[TMP31:%.*]] = insertelement <2 x i32> poison, i32 [[TMP30]], i32 0 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE24]] +; CHECK: [[PRED_LOAD_CONTINUE24]]: +; CHECK-NEXT: [[TMP32:%.*]] = phi <2 x i32> [ poison, %[[PRED_STORE_CONTINUE22]] ], [ [[TMP31]], %[[PRED_LOAD_IF23]] ] +; CHECK-NEXT: [[TMP33:%.*]] = extractelement <2 x i1> [[TMP7]], i32 1 +; CHECK-NEXT: br i1 [[TMP33]], label %[[PRED_LOAD_IF25:.*]], label %[[PRED_LOAD_CONTINUE26:.*]] +; CHECK: [[PRED_LOAD_IF25]]: +; CHECK-NEXT: [[TMP34:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[TMP5]] +; CHECK-NEXT: [[TMP35:%.*]] = load i32, ptr [[TMP34]], align 4, !alias.scope [[META30]] +; CHECK-NEXT: [[TMP36:%.*]] = insertelement <2 x i32> [[TMP32]], i32 [[TMP35]], i32 1 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE26]] +; CHECK: [[PRED_LOAD_CONTINUE26]]: +; CHECK-NEXT: [[TMP37:%.*]] = phi <2 x i32> [ [[TMP32]], %[[PRED_LOAD_CONTINUE24]] ], [ [[TMP36]], %[[PRED_LOAD_IF25]] ] +; CHECK-NEXT: [[TMP38:%.*]] = add <2 x i32> [[TMP37]], splat (i32 10) +; CHECK-NEXT: [[TMP39:%.*]] = extractelement <2 x i1> [[TMP7]], i32 0 +; CHECK-NEXT: br i1 [[TMP39]], label %[[PRED_STORE_IF27:.*]], label %[[PRED_STORE_CONTINUE28:.*]] +; CHECK: [[PRED_STORE_IF27]]: +; CHECK-NEXT: [[TMP40:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[TMP4]] +; CHECK-NEXT: [[TMP41:%.*]] = extractelement <2 x i32> [[TMP38]], i32 0 +; CHECK-NEXT: store i32 [[TMP41]], ptr [[TMP40]], align 4, !alias.scope [[META31]], !noalias [[META32]] +; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE28]] +; CHECK: [[PRED_STORE_CONTINUE28]]: +; CHECK-NEXT: [[TMP42:%.*]] = extractelement <2 x i1> [[TMP7]], i32 1 +; CHECK-NEXT: br i1 [[TMP42]], label %[[PRED_STORE_IF29:.*]], label %[[PRED_STORE_CONTINUE30]] +; CHECK: [[PRED_STORE_IF29]]: +; CHECK-NEXT: [[TMP43:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[TMP5]] +; CHECK-NEXT: [[TMP44:%.*]] = extractelement <2 x i32> [[TMP38]], i32 1 +; CHECK-NEXT: store i32 [[TMP44]], ptr [[TMP43]], align 4, !alias.scope [[META31]], !noalias [[META32]] +; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE30]] +; CHECK: [[PRED_STORE_CONTINUE30]]: +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 +; CHECK-NEXT: [[TMP45:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP45]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP33:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], [[EXIT:label %.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; +entry: + br label %loop + +loop: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop.latch ] + %gep.cond = getelementptr inbounds i32, ptr %cond, i32 %iv + %c = load i32, ptr %gep.cond, align 4 + %c.0 = icmp ule i32 %c, 11 + br i1 %c.0, label %then, label %else + +then: + %gep.src.then = getelementptr inbounds i32, ptr %src, i32 %iv + %l.src.then = load i32, ptr %gep.src.then, align 4 + %add = add i32 %l.src.then, 10 + %gep.dst = getelementptr 
inbounds i32, ptr %dst, i32 %iv + store i32 %add, ptr %gep.dst, align 4 + br label %loop.latch + +else: + %gep.dst.1.else = getelementptr inbounds i32, ptr %dst.1, i32 %iv + store i32 10, ptr %gep.dst.1.else, align 4 + %gep.src.else = getelementptr inbounds i32, ptr %src, i32 %iv + %l.src.else = load i32, ptr %gep.src.else, align 4 + %sub = sub i32 %l.src.else, 5 + %gep.dst.else = getelementptr inbounds i32, ptr %dst, i32 %iv + store i32 %sub, ptr %gep.dst.else, align 4 + br label %loop.latch + +loop.latch: + %iv.next = add nuw nsw i32 %iv, 1 + %ec = icmp eq i32 %iv.next, %n + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + +define void @test_memory_op_between_loads_alias(ptr %dst, ptr %src, ptr %cond, ptr %dst.1, i32 %n) { +; CHECK-LABEL: define void @test_memory_op_between_loads_alias( +; CHECK-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], ptr [[COND:%.*]], ptr [[DST_1:%.*]], i32 [[N:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 2 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]] +; CHECK: [[VECTOR_MEMCHECK]]: +; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[N]], -1 +; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64 +; CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 2 +; CHECK-NEXT: [[TMP3:%.*]] = add nuw nsw i64 [[TMP2]], 4 +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP3]] +; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP3]] +; CHECK-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[COND]], i64 [[TMP3]] +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[SRC]], [[SCEVGEP1]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP]] +; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; CHECK-NEXT: [[BOUND03:%.*]] = icmp ult ptr [[SRC]], [[SCEVGEP2]] +; CHECK-NEXT: [[BOUND14:%.*]] = icmp ult ptr [[COND]], [[SCEVGEP]] +; CHECK-NEXT: [[FOUND_CONFLICT5:%.*]] = and i1 [[BOUND03]], [[BOUND14]] +; CHECK-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT5]] +; CHECK-NEXT: [[BOUND06:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP2]] +; CHECK-NEXT: [[BOUND17:%.*]] = icmp ult ptr [[COND]], [[SCEVGEP1]] +; CHECK-NEXT: [[FOUND_CONFLICT8:%.*]] = and i1 [[BOUND06]], [[BOUND17]] +; CHECK-NEXT: [[CONFLICT_RDX9:%.*]] = or i1 [[CONFLICT_RDX]], [[FOUND_CONFLICT8]] +; CHECK-NEXT: br i1 [[CONFLICT_RDX9]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N]], 2 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE17:.*]] ] +; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[INDEX]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[INDEX]], 1 +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[COND]], i32 [[TMP4]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP6]], align 4, !alias.scope [[META35:![0-9]+]] +; CHECK-NEXT: [[TMP7:%.*]] = icmp ule <2 x i32> [[WIDE_LOAD]], splat (i32 11) +; CHECK-NEXT: [[TMP8:%.*]] = xor <2 x i1> [[TMP7]], splat (i1 true) +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x i1> [[TMP8]], i32 0 +; CHECK-NEXT: br i1 [[TMP9]], label %[[PRED_LOAD_IF:.*]], label %[[PRED_LOAD_CONTINUE:.*]] +; CHECK: [[PRED_LOAD_IF]]: +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[TMP4]] +; CHECK-NEXT: store i32 0, ptr [[TMP10]], align 4, 
!alias.scope [[META38:![0-9]+]], !noalias [[META40:![0-9]+]] +; CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP10]], align 4, !alias.scope [[META38]], !noalias [[META40]] +; CHECK-NEXT: [[TMP12:%.*]] = insertelement <2 x i32> poison, i32 [[TMP11]], i32 0 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE]] +; CHECK: [[PRED_LOAD_CONTINUE]]: +; CHECK-NEXT: [[TMP13:%.*]] = phi <2 x i32> [ poison, %[[VECTOR_BODY]] ], [ [[TMP12]], %[[PRED_LOAD_IF]] ] +; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x i1> [[TMP8]], i32 1 +; CHECK-NEXT: br i1 [[TMP14]], label %[[PRED_LOAD_IF10:.*]], label %[[PRED_LOAD_CONTINUE11:.*]] +; CHECK: [[PRED_LOAD_IF10]]: +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[TMP5]] +; CHECK-NEXT: store i32 0, ptr [[TMP15]], align 4, !alias.scope [[META38]], !noalias [[META40]] +; CHECK-NEXT: [[TMP16:%.*]] = load i32, ptr [[TMP15]], align 4, !alias.scope [[META38]], !noalias [[META40]] +; CHECK-NEXT: [[TMP17:%.*]] = insertelement <2 x i32> [[TMP13]], i32 [[TMP16]], i32 1 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE11]] +; CHECK: [[PRED_LOAD_CONTINUE11]]: +; CHECK-NEXT: [[TMP18:%.*]] = phi <2 x i32> [ [[TMP13]], %[[PRED_LOAD_CONTINUE]] ], [ [[TMP17]], %[[PRED_LOAD_IF10]] ] +; CHECK-NEXT: [[TMP19:%.*]] = add <2 x i32> [[TMP18]], splat (i32 10) +; CHECK-NEXT: [[TMP20:%.*]] = extractelement <2 x i1> [[TMP8]], i32 0 +; CHECK-NEXT: br i1 [[TMP20]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]] +; CHECK: [[PRED_STORE_IF]]: +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[TMP4]] +; CHECK-NEXT: [[TMP22:%.*]] = extractelement <2 x i32> [[TMP19]], i32 0 +; CHECK-NEXT: store i32 [[TMP22]], ptr [[TMP21]], align 4, !alias.scope [[META42:![0-9]+]], !noalias [[META35]] +; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE]] +; CHECK: [[PRED_STORE_CONTINUE]]: +; CHECK-NEXT: [[TMP23:%.*]] = extractelement <2 x i1> [[TMP8]], i32 1 +; CHECK-NEXT: br i1 [[TMP23]], label %[[PRED_STORE_IF12:.*]], label %[[PRED_STORE_CONTINUE13:.*]] +; CHECK: [[PRED_STORE_IF12]]: +; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[TMP5]] +; CHECK-NEXT: [[TMP25:%.*]] = extractelement <2 x i32> [[TMP19]], i32 1 +; CHECK-NEXT: store i32 [[TMP25]], ptr [[TMP24]], align 4, !alias.scope [[META42]], !noalias [[META35]] +; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE13]] +; CHECK: [[PRED_STORE_CONTINUE13]]: +; CHECK-NEXT: [[TMP26:%.*]] = extractelement <2 x i1> [[TMP7]], i32 0 +; CHECK-NEXT: br i1 [[TMP26]], label %[[PRED_STORE_IF14:.*]], label %[[PRED_STORE_CONTINUE15:.*]] +; CHECK: [[PRED_STORE_IF14]]: +; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[TMP4]] +; CHECK-NEXT: [[TMP28:%.*]] = load i32, ptr [[TMP27]], align 4, !alias.scope [[META38]], !noalias [[META40]] +; CHECK-NEXT: [[TMP29:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[TMP4]] +; CHECK-NEXT: store i32 [[TMP28]], ptr [[TMP29]], align 4, !alias.scope [[META42]], !noalias [[META35]] +; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE15]] +; CHECK: [[PRED_STORE_CONTINUE15]]: +; CHECK-NEXT: [[TMP30:%.*]] = extractelement <2 x i1> [[TMP7]], i32 1 +; CHECK-NEXT: br i1 [[TMP30]], label %[[PRED_STORE_IF16:.*]], label %[[PRED_STORE_CONTINUE17]] +; CHECK: [[PRED_STORE_IF16]]: +; CHECK-NEXT: [[TMP31:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[TMP5]] +; CHECK-NEXT: [[TMP32:%.*]] = load i32, ptr [[TMP31]], align 4, !alias.scope [[META38]], !noalias [[META40]] +; CHECK-NEXT: [[TMP33:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[TMP5]] 
+; CHECK-NEXT: store i32 [[TMP32]], ptr [[TMP33]], align 4, !alias.scope [[META42]], !noalias [[META35]] +; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE17]] +; CHECK: [[PRED_STORE_CONTINUE17]]: +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 +; CHECK-NEXT: [[TMP34:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP34]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP43:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], [[EXIT:label %.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; +entry: + br label %loop + +loop: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop.latch ] + %gep.cond = getelementptr inbounds i32, ptr %cond, i32 %iv + %c = load i32, ptr %gep.cond, align 4 + %c.0 = icmp ule i32 %c, 11 + br i1 %c.0, label %then, label %middle + +middle: + %gep.src.middle = getelementptr inbounds i32, ptr %src, i32 %iv + store i32 0, ptr %gep.src.middle, align 4 + br label %else + +then: + %gep.src.then = getelementptr inbounds i32, ptr %src, i32 %iv + %l.src.then = load i32, ptr %gep.src.then, align 4 + %gep.dst.then = getelementptr inbounds i32, ptr %dst, i32 %iv + store i32 %l.src.then, ptr %gep.dst.then, align 4 + br label %loop.latch + +else: + %gep.src.else = getelementptr inbounds i32, ptr %src, i32 %iv + %l.src.else = load i32, ptr %gep.src.else, align 4 + %add = add i32 %l.src.else, 10 + %gep.dst.else = getelementptr inbounds i32, ptr %dst, i32 %iv + store i32 %add, ptr %gep.dst.else, align 4 + br label %loop.latch + +loop.latch: + %iv.next = add nuw nsw i32 %iv, 1 + %ec = icmp eq i32 %iv.next, %n + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + +define void @test_memory_op_between_loads_no_alias_via_rt_checks(ptr %dst, ptr %src, ptr %cond, ptr %dst.1, i32 %n) { +; CHECK-LABEL: define void @test_memory_op_between_loads_no_alias_via_rt_checks( +; CHECK-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], ptr [[COND:%.*]], ptr [[DST_1:%.*]], i32 [[N:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 2 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]] +; CHECK: [[VECTOR_MEMCHECK]]: +; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[N]], -1 +; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64 +; CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 2 +; CHECK-NEXT: [[TMP3:%.*]] = add nuw nsw i64 [[TMP2]], 4 +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[DST_1]], i64 [[TMP3]] +; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP3]] +; CHECK-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[COND]], i64 [[TMP3]] +; CHECK-NEXT: [[SCEVGEP3:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP3]] +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[DST_1]], [[SCEVGEP1]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP]] +; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; CHECK-NEXT: [[BOUND04:%.*]] = icmp ult ptr [[DST_1]], [[SCEVGEP2]] +; CHECK-NEXT: [[BOUND15:%.*]] = icmp ult ptr [[COND]], [[SCEVGEP]] +; CHECK-NEXT: [[FOUND_CONFLICT6:%.*]] = and i1 [[BOUND04]], [[BOUND15]] +; CHECK-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT6]] +; CHECK-NEXT: [[BOUND07:%.*]] = icmp ult ptr [[DST_1]], [[SCEVGEP3]] +; CHECK-NEXT: [[BOUND18:%.*]] = icmp ult ptr [[SRC]], [[SCEVGEP]] +; CHECK-NEXT: [[FOUND_CONFLICT9:%.*]] = and i1 [[BOUND07]], [[BOUND18]] +; CHECK-NEXT: [[CONFLICT_RDX10:%.*]] = or i1 
[[CONFLICT_RDX]], [[FOUND_CONFLICT9]] +; CHECK-NEXT: [[BOUND011:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP2]] +; CHECK-NEXT: [[BOUND112:%.*]] = icmp ult ptr [[COND]], [[SCEVGEP1]] +; CHECK-NEXT: [[FOUND_CONFLICT13:%.*]] = and i1 [[BOUND011]], [[BOUND112]] +; CHECK-NEXT: [[CONFLICT_RDX14:%.*]] = or i1 [[CONFLICT_RDX10]], [[FOUND_CONFLICT13]] +; CHECK-NEXT: [[BOUND015:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP3]] +; CHECK-NEXT: [[BOUND116:%.*]] = icmp ult ptr [[SRC]], [[SCEVGEP1]] +; CHECK-NEXT: [[FOUND_CONFLICT17:%.*]] = and i1 [[BOUND015]], [[BOUND116]] +; CHECK-NEXT: [[CONFLICT_RDX18:%.*]] = or i1 [[CONFLICT_RDX14]], [[FOUND_CONFLICT17]] +; CHECK-NEXT: br i1 [[CONFLICT_RDX18]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N]], 2 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE26:.*]] ] +; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[INDEX]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[INDEX]], 1 +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[COND]], i32 [[TMP4]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP6]], align 4, !alias.scope [[META45:![0-9]+]] +; CHECK-NEXT: [[TMP7:%.*]] = icmp ule <2 x i32> [[WIDE_LOAD]], splat (i32 11) +; CHECK-NEXT: [[TMP8:%.*]] = xor <2 x i1> [[TMP7]], splat (i1 true) +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x i1> [[TMP8]], i32 0 +; CHECK-NEXT: br i1 [[TMP9]], label %[[PRED_LOAD_IF:.*]], label %[[PRED_LOAD_CONTINUE:.*]] +; CHECK: [[PRED_LOAD_IF]]: +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[DST_1]], i32 [[TMP4]] +; CHECK-NEXT: store i32 0, ptr [[TMP10]], align 4, !alias.scope [[META48:![0-9]+]], !noalias [[META50:![0-9]+]] +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[TMP4]] +; CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr [[TMP11]], align 4, !alias.scope [[META53:![0-9]+]] +; CHECK-NEXT: [[TMP13:%.*]] = insertelement <2 x i32> poison, i32 [[TMP12]], i32 0 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE]] +; CHECK: [[PRED_LOAD_CONTINUE]]: +; CHECK-NEXT: [[TMP14:%.*]] = phi <2 x i32> [ poison, %[[VECTOR_BODY]] ], [ [[TMP13]], %[[PRED_LOAD_IF]] ] +; CHECK-NEXT: [[TMP15:%.*]] = extractelement <2 x i1> [[TMP8]], i32 1 +; CHECK-NEXT: br i1 [[TMP15]], label %[[PRED_LOAD_IF19:.*]], label %[[PRED_LOAD_CONTINUE20:.*]] +; CHECK: [[PRED_LOAD_IF19]]: +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[DST_1]], i32 [[TMP5]] +; CHECK-NEXT: store i32 0, ptr [[TMP16]], align 4, !alias.scope [[META48]], !noalias [[META50]] +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[TMP5]] +; CHECK-NEXT: [[TMP18:%.*]] = load i32, ptr [[TMP17]], align 4, !alias.scope [[META53]] +; CHECK-NEXT: [[TMP19:%.*]] = insertelement <2 x i32> [[TMP14]], i32 [[TMP18]], i32 1 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE20]] +; CHECK: [[PRED_LOAD_CONTINUE20]]: +; CHECK-NEXT: [[TMP20:%.*]] = phi <2 x i32> [ [[TMP14]], %[[PRED_LOAD_CONTINUE]] ], [ [[TMP19]], %[[PRED_LOAD_IF19]] ] +; CHECK-NEXT: [[TMP21:%.*]] = add <2 x i32> [[TMP20]], splat (i32 10) +; CHECK-NEXT: [[TMP22:%.*]] = extractelement <2 x i1> [[TMP8]], i32 0 +; CHECK-NEXT: br i1 [[TMP22]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]] +; CHECK: [[PRED_STORE_IF]]: +; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[TMP4]] +; CHECK-NEXT: 
[[TMP24:%.*]] = extractelement <2 x i32> [[TMP21]], i32 0 +; CHECK-NEXT: store i32 [[TMP24]], ptr [[TMP23]], align 4, !alias.scope [[META54:![0-9]+]], !noalias [[META55:![0-9]+]] +; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE]] +; CHECK: [[PRED_STORE_CONTINUE]]: +; CHECK-NEXT: [[TMP25:%.*]] = extractelement <2 x i1> [[TMP8]], i32 1 +; CHECK-NEXT: br i1 [[TMP25]], label %[[PRED_STORE_IF21:.*]], label %[[PRED_STORE_CONTINUE22:.*]] +; CHECK: [[PRED_STORE_IF21]]: +; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[TMP5]] +; CHECK-NEXT: [[TMP27:%.*]] = extractelement <2 x i32> [[TMP21]], i32 1 +; CHECK-NEXT: store i32 [[TMP27]], ptr [[TMP26]], align 4, !alias.scope [[META54]], !noalias [[META55]] +; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE22]] +; CHECK: [[PRED_STORE_CONTINUE22]]: +; CHECK-NEXT: [[TMP28:%.*]] = extractelement <2 x i1> [[TMP7]], i32 0 +; CHECK-NEXT: br i1 [[TMP28]], label %[[PRED_STORE_IF23:.*]], label %[[PRED_STORE_CONTINUE24:.*]] +; CHECK: [[PRED_STORE_IF23]]: +; CHECK-NEXT: [[TMP29:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[TMP4]] +; CHECK-NEXT: [[TMP30:%.*]] = load i32, ptr [[TMP29]], align 4, !alias.scope [[META53]] +; CHECK-NEXT: [[TMP31:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[TMP4]] +; CHECK-NEXT: store i32 [[TMP30]], ptr [[TMP31]], align 4, !alias.scope [[META54]], !noalias [[META55]] +; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE24]] +; CHECK: [[PRED_STORE_CONTINUE24]]: +; CHECK-NEXT: [[TMP32:%.*]] = extractelement <2 x i1> [[TMP7]], i32 1 +; CHECK-NEXT: br i1 [[TMP32]], label %[[PRED_STORE_IF25:.*]], label %[[PRED_STORE_CONTINUE26]] +; CHECK: [[PRED_STORE_IF25]]: +; CHECK-NEXT: [[TMP33:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[TMP5]] +; CHECK-NEXT: [[TMP34:%.*]] = load i32, ptr [[TMP33]], align 4, !alias.scope [[META53]] +; CHECK-NEXT: [[TMP35:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[TMP5]] +; CHECK-NEXT: store i32 [[TMP34]], ptr [[TMP35]], align 4, !alias.scope [[META54]], !noalias [[META55]] +; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE26]] +; CHECK: [[PRED_STORE_CONTINUE26]]: +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 +; CHECK-NEXT: [[TMP36:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP36]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP56:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], [[EXIT:label %.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; +entry: + br label %loop + +loop: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop.latch ] + %gep.cond = getelementptr inbounds i32, ptr %cond, i32 %iv + %c = load i32, ptr %gep.cond, align 4 + %c.0 = icmp ule i32 %c, 11 + br i1 %c.0, label %then, label %middle + +middle: + %gep.dst.1 = getelementptr inbounds i32, ptr %dst.1, i32 %iv + store i32 0, ptr %gep.dst.1, align 4 + br label %else + +then: + %gep.src.then = getelementptr inbounds i32, ptr %src, i32 %iv + %l.src.then = load i32, ptr %gep.src.then, align 4 + %gep.dst.then = getelementptr inbounds i32, ptr %dst, i32 %iv + store i32 %l.src.then, ptr %gep.dst.then, align 4 + br label %loop.latch + +else: + %gep.src.else = getelementptr inbounds i32, ptr %src, i32 %iv + %l.src.else = load i32, ptr %gep.src.else, align 4 + %add = add i32 %l.src.else, 10 + %gep.dst.else = getelementptr inbounds i32, ptr %dst, i32 %iv + store i32 %add, ptr %gep.dst.else, align 4 + br label %loop.latch + +loop.latch: + %iv.next = add nuw nsw i32 %iv, 1 + %ec = 
icmp eq i32 %iv.next, %n + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + diff --git a/llvm/test/Transforms/LoopVectorize/hoist-predicated-loads.ll b/llvm/test/Transforms/LoopVectorize/hoist-predicated-loads.ll new file mode 100644 index 0000000000000..b30d010aaf9c9 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/hoist-predicated-loads.ll @@ -0,0 +1,1291 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --filter-out-after "scalar.ph:" --version 6 +; RUN: opt -passes=loop-vectorize -force-vector-width=2 -force-vector-interleave=1 -S %s | FileCheck %s + +define void @test(ptr %dst, ptr %src, ptr %cond, i32 %n) { +; CHECK-LABEL: define void @test( +; CHECK-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], ptr [[COND:%.*]], i32 [[N:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 2 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]] +; CHECK: [[VECTOR_MEMCHECK]]: +; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[N]], -1 +; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64 +; CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 2 +; CHECK-NEXT: [[TMP3:%.*]] = add nuw nsw i64 [[TMP2]], 4 +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP3]] +; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[COND]], i64 [[TMP3]] +; CHECK-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP3]] +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP1]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[COND]], [[SCEVGEP]] +; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; CHECK-NEXT: [[BOUND03:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP2]] +; CHECK-NEXT: [[BOUND14:%.*]] = icmp ult ptr [[SRC]], [[SCEVGEP]] +; CHECK-NEXT: [[FOUND_CONFLICT5:%.*]] = and i1 [[BOUND03]], [[BOUND14]] +; CHECK-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT5]] +; CHECK-NEXT: br i1 [[CONFLICT_RDX]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N]], 2 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_LOAD_CONTINUE11:.*]] ] +; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[INDEX]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[INDEX]], 1 +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[TMP4]] +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[TMP5]] +; CHECK-NEXT: [[TMP10:%.*]] = insertelement <2 x ptr> poison, ptr [[TMP8]], i32 0 +; CHECK-NEXT: [[TMP11:%.*]] = insertelement <2 x ptr> [[TMP10]], ptr [[TMP9]], i32 1 +; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[COND]], i32 [[TMP4]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP24]], align 4, !alias.scope [[META0:![0-9]+]] +; CHECK-NEXT: [[TMP15:%.*]] = icmp ule <2 x i32> [[WIDE_LOAD]], splat (i32 11) +; CHECK-NEXT: [[TMP34:%.*]] = xor <2 x i1> [[TMP15]], splat (i1 true) +; CHECK-NEXT: [[TMP35:%.*]] = extractelement <2 x i1> [[TMP34]], i32 0 +; CHECK-NEXT: br i1 [[TMP35]], label %[[PRED_LOAD_IF:.*]], label %[[PRED_LOAD_CONTINUE:.*]] +; CHECK: [[PRED_LOAD_IF]]: +; CHECK-NEXT: [[TMP18:%.*]] = load i32, ptr [[TMP8]], align 4, !alias.scope [[META3:![0-9]+]] +; CHECK-NEXT: [[TMP19:%.*]] = insertelement <2 x i32> poison, i32 [[TMP18]], i32 0 +; 
CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE]] +; CHECK: [[PRED_LOAD_CONTINUE]]: +; CHECK-NEXT: [[TMP20:%.*]] = phi <2 x i32> [ poison, %[[VECTOR_BODY]] ], [ [[TMP19]], %[[PRED_LOAD_IF]] ] +; CHECK-NEXT: [[TMP21:%.*]] = extractelement <2 x i1> [[TMP34]], i32 1 +; CHECK-NEXT: br i1 [[TMP21]], label %[[PRED_LOAD_IF6:.*]], label %[[PRED_LOAD_CONTINUE7:.*]] +; CHECK: [[PRED_LOAD_IF6]]: +; CHECK-NEXT: [[TMP22:%.*]] = load i32, ptr [[TMP9]], align 4, !alias.scope [[META3]] +; CHECK-NEXT: [[TMP23:%.*]] = insertelement <2 x i32> [[TMP20]], i32 [[TMP22]], i32 1 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE7]] +; CHECK: [[PRED_LOAD_CONTINUE7]]: +; CHECK-NEXT: [[TMP36:%.*]] = phi <2 x i32> [ [[TMP20]], %[[PRED_LOAD_CONTINUE]] ], [ [[TMP23]], %[[PRED_LOAD_IF6]] ] +; CHECK-NEXT: [[TMP25:%.*]] = add <2 x i32> [[TMP36]], splat (i32 10) +; CHECK-NEXT: [[TMP30:%.*]] = extractelement <2 x i1> [[TMP15]], i32 0 +; CHECK-NEXT: br i1 [[TMP30]], label %[[PRED_LOAD_IF8:.*]], label %[[PRED_LOAD_CONTINUE9:.*]] +; CHECK: [[PRED_LOAD_IF8]]: +; CHECK-NEXT: [[TMP26:%.*]] = load i32, ptr [[TMP8]], align 4, !alias.scope [[META3]] +; CHECK-NEXT: [[TMP31:%.*]] = insertelement <2 x i32> poison, i32 [[TMP26]], i32 0 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE9]] +; CHECK: [[PRED_LOAD_CONTINUE9]]: +; CHECK-NEXT: [[TMP33:%.*]] = phi <2 x i32> [ poison, %[[PRED_LOAD_CONTINUE7]] ], [ [[TMP31]], %[[PRED_LOAD_IF8]] ] +; CHECK-NEXT: [[TMP32:%.*]] = extractelement <2 x i1> [[TMP15]], i32 1 +; CHECK-NEXT: br i1 [[TMP32]], label %[[PRED_LOAD_IF10:.*]], label %[[PRED_LOAD_CONTINUE11]] +; CHECK: [[PRED_LOAD_IF10]]: +; CHECK-NEXT: [[TMP27:%.*]] = load i32, ptr [[TMP9]], align 4, !alias.scope [[META3]] +; CHECK-NEXT: [[TMP28:%.*]] = insertelement <2 x i32> [[TMP33]], i32 [[TMP27]], i32 1 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE11]] +; CHECK: [[PRED_LOAD_CONTINUE11]]: +; CHECK-NEXT: [[TMP29:%.*]] = phi <2 x i32> [ [[TMP33]], %[[PRED_LOAD_CONTINUE9]] ], [ [[TMP28]], %[[PRED_LOAD_IF10]] ] +; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP15]], <2 x i32> [[TMP29]], <2 x i32> [[TMP25]] +; CHECK-NEXT: [[TMP37:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[TMP4]] +; CHECK-NEXT: store <2 x i32> [[PREDPHI]], ptr [[TMP37]], align 4, !alias.scope [[META5:![0-9]+]], !noalias [[META7:![0-9]+]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 +; CHECK-NEXT: [[TMP60:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP60]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], [[EXIT:label %.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; +entry: + br label %loop + +loop: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop.latch ] + %gep.src = getelementptr inbounds i32, ptr %src, i32 %iv + %gep.cond = getelementptr inbounds i32, ptr %cond, i32 %iv + %l.c = load i32, ptr %gep.cond + %c = icmp ule i32 %l.c, 11 + br i1 %c, label %then, label %else + +then: + %l.src = load i32, ptr %gep.src, align 4 + br label %loop.latch + +else: + %l.src.2 = load i32, ptr %gep.src, align 4 + %add = add i32 %l.src.2, 10 + br label %loop.latch + +loop.latch: + %merge = phi i32 [ %l.src, %then ], [ %add, %else ] + %gep.dst = getelementptr inbounds i32, ptr %dst, i32 %iv + store i32 %merge, ptr %gep.dst, align 4 + %iv.next = add nuw nsw i32 %iv, 1 + %ec = icmp eq i32 %iv.next, %n + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + +; Negative test: Different addresses - 
should NOT hoist +define void @different_addresses(ptr %dst, ptr %src1, ptr %src2, ptr %cond, i32 %n) { +; CHECK-LABEL: define void @different_addresses( +; CHECK-SAME: ptr [[DST:%.*]], ptr [[SRC1:%.*]], ptr [[SRC2:%.*]], ptr [[COND:%.*]], i32 [[N:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[SRC15:%.*]] = ptrtoint ptr [[SRC1]] to i64 +; CHECK-NEXT: [[SRC23:%.*]] = ptrtoint ptr [[SRC2]] to i64 +; CHECK-NEXT: [[COND2:%.*]] = ptrtoint ptr [[COND]] to i64 +; CHECK-NEXT: [[DST1:%.*]] = ptrtoint ptr [[DST]] to i64 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 2 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]] +; CHECK: [[VECTOR_MEMCHECK]]: +; CHECK-NEXT: [[TMP0:%.*]] = sub i64 [[DST1]], [[COND2]] +; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = icmp ult i64 [[TMP0]], 8 +; CHECK-NEXT: [[TMP1:%.*]] = sub i64 [[DST1]], [[SRC23]] +; CHECK-NEXT: [[FOUND_CONFLICT6:%.*]] = icmp ult i64 [[TMP1]], 8 +; CHECK-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT6]] +; CHECK-NEXT: [[TMP2:%.*]] = sub i64 [[DST1]], [[SRC15]] +; CHECK-NEXT: [[FOUND_CONFLICT9:%.*]] = icmp ult i64 [[TMP2]], 8 +; CHECK-NEXT: [[CONFLICT_RDX10:%.*]] = or i1 [[CONFLICT_RDX]], [[FOUND_CONFLICT9]] +; CHECK-NEXT: br i1 [[CONFLICT_RDX10]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N]], 2 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_LOAD_CONTINUE13:.*]] ] +; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[INDEX]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[INDEX]], 1 +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[COND]], i32 [[TMP4]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP16]], align 4 +; CHECK-NEXT: [[TMP11:%.*]] = icmp ule <2 x i32> [[WIDE_LOAD]], splat (i32 11) +; CHECK-NEXT: [[TMP12:%.*]] = xor <2 x i1> [[TMP11]], splat (i1 true) +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x i1> [[TMP12]], i32 0 +; CHECK-NEXT: br i1 [[TMP13]], label %[[PRED_LOAD_IF:.*]], label %[[PRED_LOAD_CONTINUE:.*]] +; CHECK: [[PRED_LOAD_IF]]: +; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[SRC2]], i32 [[TMP4]] +; CHECK-NEXT: [[TMP15:%.*]] = load i32, ptr [[TMP20]], align 4 +; CHECK-NEXT: [[TMP19:%.*]] = insertelement <2 x i32> poison, i32 [[TMP15]], i32 0 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE]] +; CHECK: [[PRED_LOAD_CONTINUE]]: +; CHECK-NEXT: [[TMP17:%.*]] = phi <2 x i32> [ poison, %[[VECTOR_BODY]] ], [ [[TMP19]], %[[PRED_LOAD_IF]] ] +; CHECK-NEXT: [[TMP18:%.*]] = extractelement <2 x i1> [[TMP12]], i32 1 +; CHECK-NEXT: br i1 [[TMP18]], label %[[PRED_LOAD_IF8:.*]], label %[[PRED_LOAD_CONTINUE9:.*]] +; CHECK: [[PRED_LOAD_IF8]]: +; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[SRC2]], i32 [[TMP5]] +; CHECK-NEXT: [[TMP29:%.*]] = load i32, ptr [[TMP25]], align 4 +; CHECK-NEXT: [[TMP21:%.*]] = insertelement <2 x i32> [[TMP17]], i32 [[TMP29]], i32 1 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE9]] +; CHECK: [[PRED_LOAD_CONTINUE9]]: +; CHECK-NEXT: [[TMP22:%.*]] = phi <2 x i32> [ [[TMP17]], %[[PRED_LOAD_CONTINUE]] ], [ [[TMP21]], %[[PRED_LOAD_IF8]] ] +; CHECK-NEXT: [[TMP23:%.*]] = add <2 x i32> [[TMP22]], splat (i32 10) +; CHECK-NEXT: [[TMP28:%.*]] = extractelement <2 x i1> [[TMP11]], i32 0 +; CHECK-NEXT: br i1 [[TMP28]], label %[[PRED_LOAD_IF10:.*]], label 
%[[PRED_LOAD_CONTINUE11:.*]] +; CHECK: [[PRED_LOAD_IF10]]: +; CHECK-NEXT: [[TMP49:%.*]] = getelementptr inbounds i32, ptr [[SRC1]], i32 [[TMP4]] +; CHECK-NEXT: [[TMP24:%.*]] = load i32, ptr [[TMP49]], align 4 +; CHECK-NEXT: [[TMP30:%.*]] = insertelement <2 x i32> poison, i32 [[TMP24]], i32 0 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE11]] +; CHECK: [[PRED_LOAD_CONTINUE11]]: +; CHECK-NEXT: [[TMP32:%.*]] = phi <2 x i32> [ poison, %[[PRED_LOAD_CONTINUE9]] ], [ [[TMP30]], %[[PRED_LOAD_IF10]] ] +; CHECK-NEXT: [[TMP31:%.*]] = extractelement <2 x i1> [[TMP11]], i32 1 +; CHECK-NEXT: br i1 [[TMP31]], label %[[PRED_LOAD_IF12:.*]], label %[[PRED_LOAD_CONTINUE13]] +; CHECK: [[PRED_LOAD_IF12]]: +; CHECK-NEXT: [[TMP52:%.*]] = getelementptr inbounds i32, ptr [[SRC1]], i32 [[TMP5]] +; CHECK-NEXT: [[TMP26:%.*]] = load i32, ptr [[TMP52]], align 4 +; CHECK-NEXT: [[TMP27:%.*]] = insertelement <2 x i32> [[TMP32]], i32 [[TMP26]], i32 1 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE13]] +; CHECK: [[PRED_LOAD_CONTINUE13]]: +; CHECK-NEXT: [[TMP33:%.*]] = phi <2 x i32> [ [[TMP32]], %[[PRED_LOAD_CONTINUE11]] ], [ [[TMP27]], %[[PRED_LOAD_IF12]] ] +; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP11]], <2 x i32> [[TMP33]], <2 x i32> [[TMP23]] +; CHECK-NEXT: [[TMP34:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[TMP4]] +; CHECK-NEXT: store <2 x i32> [[PREDPHI]], ptr [[TMP34]], align 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 +; CHECK-NEXT: [[TMP60:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP60]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], [[EXIT:label %.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; +entry: + br label %loop + +loop: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop.latch ] + %gep.src.1 = getelementptr inbounds i32, ptr %src1, i32 %iv + %gep.src.2 = getelementptr inbounds i32, ptr %src2, i32 %iv + %gep.cond = getelementptr inbounds i32, ptr %cond, i32 %iv + %l.c = load i32, ptr %gep.cond + %c = icmp ule i32 %l.c, 11 + br i1 %c, label %then, label %else + +then: + %l.src = load i32, ptr %gep.src.1, align 4 + br label %loop.latch + +else: + %l.src.2 = load i32, ptr %gep.src.2, align 4 + %add = add i32 %l.src.2, 10 + br label %loop.latch + +loop.latch: + %merge = phi i32 [ %l.src, %then ], [ %add, %else ] + %gep.dst = getelementptr inbounds i32, ptr %dst, i32 %iv + store i32 %merge, ptr %gep.dst, align 4 + %iv.next = add nuw nsw i32 %iv, 1 + %ec = icmp eq i32 %iv.next, %n + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + +; Negative test: Non-complementary masks - should NOT hoist +define void @non_complementary_masks(ptr %dst, ptr %src, ptr %cond1, ptr %cond2, i32 %n) { +; CHECK-LABEL: define void @non_complementary_masks( +; CHECK-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], ptr [[COND1:%.*]], ptr [[COND2:%.*]], i32 [[N:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 2 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]] +; CHECK: [[VECTOR_MEMCHECK]]: +; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[N]], -1 +; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64 +; CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 2 +; CHECK-NEXT: [[TMP3:%.*]] = add nuw nsw i64 [[TMP2]], 4 +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP3]] +; CHECK-NEXT: [[SCEVGEP1:%.*]] = 
getelementptr i8, ptr [[COND1]], i64 [[TMP3]] +; CHECK-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[COND2]], i64 [[TMP3]] +; CHECK-NEXT: [[SCEVGEP3:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP3]] +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP1]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[COND1]], [[SCEVGEP]] +; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; CHECK-NEXT: [[BOUND04:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP2]] +; CHECK-NEXT: [[BOUND15:%.*]] = icmp ult ptr [[COND2]], [[SCEVGEP]] +; CHECK-NEXT: [[FOUND_CONFLICT6:%.*]] = and i1 [[BOUND04]], [[BOUND15]] +; CHECK-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT6]] +; CHECK-NEXT: [[BOUND07:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP3]] +; CHECK-NEXT: [[BOUND18:%.*]] = icmp ult ptr [[SRC]], [[SCEVGEP]] +; CHECK-NEXT: [[FOUND_CONFLICT9:%.*]] = and i1 [[BOUND07]], [[BOUND18]] +; CHECK-NEXT: [[CONFLICT_RDX10:%.*]] = or i1 [[CONFLICT_RDX]], [[FOUND_CONFLICT9]] +; CHECK-NEXT: br i1 [[CONFLICT_RDX10]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N]], 2 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_LOAD_CONTINUE17:.*]] ] +; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[INDEX]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[INDEX]], 1 +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[TMP4]] +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[TMP5]] +; CHECK-NEXT: [[TMP10:%.*]] = insertelement <2 x ptr> poison, ptr [[TMP8]], i32 0 +; CHECK-NEXT: [[TMP11:%.*]] = insertelement <2 x ptr> [[TMP10]], ptr [[TMP9]], i32 1 +; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[COND1]], i32 [[TMP4]] +; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[COND2]], i32 [[TMP4]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP24]], align 4, !alias.scope [[META14:![0-9]+]] +; CHECK-NEXT: [[WIDE_LOAD11:%.*]] = load <2 x i32>, ptr [[TMP25]], align 4, !alias.scope [[META17:![0-9]+]] +; CHECK-NEXT: [[TMP37:%.*]] = icmp ule <2 x i32> [[WIDE_LOAD]], splat (i32 11) +; CHECK-NEXT: [[TMP38:%.*]] = icmp ule <2 x i32> [[WIDE_LOAD11]], splat (i32 20) +; CHECK-NEXT: [[TMP18:%.*]] = xor <2 x i1> [[TMP37]], splat (i1 true) +; CHECK-NEXT: [[TMP19:%.*]] = select <2 x i1> [[TMP18]], <2 x i1> [[TMP38]], <2 x i1> zeroinitializer +; CHECK-NEXT: [[TMP20:%.*]] = extractelement <2 x i1> [[TMP19]], i32 0 +; CHECK-NEXT: br i1 [[TMP20]], label %[[PRED_LOAD_IF:.*]], label %[[PRED_LOAD_CONTINUE:.*]] +; CHECK: [[PRED_LOAD_IF]]: +; CHECK-NEXT: [[TMP21:%.*]] = load i32, ptr [[TMP8]], align 4, !alias.scope [[META19:![0-9]+]] +; CHECK-NEXT: [[TMP22:%.*]] = insertelement <2 x i32> poison, i32 [[TMP21]], i32 0 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE]] +; CHECK: [[PRED_LOAD_CONTINUE]]: +; CHECK-NEXT: [[TMP23:%.*]] = phi <2 x i32> [ poison, %[[VECTOR_BODY]] ], [ [[TMP22]], %[[PRED_LOAD_IF]] ] +; CHECK-NEXT: [[TMP39:%.*]] = extractelement <2 x i1> [[TMP19]], i32 1 +; CHECK-NEXT: br i1 [[TMP39]], label %[[PRED_LOAD_IF12:.*]], label %[[PRED_LOAD_CONTINUE13:.*]] +; CHECK: [[PRED_LOAD_IF12]]: +; CHECK-NEXT: [[TMP40:%.*]] = load i32, ptr [[TMP9]], align 4, !alias.scope [[META19]] +; CHECK-NEXT: [[TMP26:%.*]] = insertelement <2 x i32> [[TMP23]], i32 [[TMP40]], i32 1 +; CHECK-NEXT: br label 
%[[PRED_LOAD_CONTINUE13]] +; CHECK: [[PRED_LOAD_CONTINUE13]]: +; CHECK-NEXT: [[TMP27:%.*]] = phi <2 x i32> [ [[TMP23]], %[[PRED_LOAD_CONTINUE]] ], [ [[TMP26]], %[[PRED_LOAD_IF12]] ] +; CHECK-NEXT: [[TMP28:%.*]] = add <2 x i32> [[TMP27]], splat (i32 10) +; CHECK-NEXT: [[TMP33:%.*]] = extractelement <2 x i1> [[TMP37]], i32 0 +; CHECK-NEXT: br i1 [[TMP33]], label %[[PRED_LOAD_IF14:.*]], label %[[PRED_LOAD_CONTINUE15:.*]] +; CHECK: [[PRED_LOAD_IF14]]: +; CHECK-NEXT: [[TMP29:%.*]] = load i32, ptr [[TMP8]], align 4, !alias.scope [[META19]] +; CHECK-NEXT: [[TMP34:%.*]] = insertelement <2 x i32> poison, i32 [[TMP29]], i32 0 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE15]] +; CHECK: [[PRED_LOAD_CONTINUE15]]: +; CHECK-NEXT: [[TMP36:%.*]] = phi <2 x i32> [ poison, %[[PRED_LOAD_CONTINUE13]] ], [ [[TMP34]], %[[PRED_LOAD_IF14]] ] +; CHECK-NEXT: [[TMP35:%.*]] = extractelement <2 x i1> [[TMP37]], i32 1 +; CHECK-NEXT: br i1 [[TMP35]], label %[[PRED_LOAD_IF16:.*]], label %[[PRED_LOAD_CONTINUE17]] +; CHECK: [[PRED_LOAD_IF16]]: +; CHECK-NEXT: [[TMP30:%.*]] = load i32, ptr [[TMP9]], align 4, !alias.scope [[META19]] +; CHECK-NEXT: [[TMP31:%.*]] = insertelement <2 x i32> [[TMP36]], i32 [[TMP30]], i32 1 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE17]] +; CHECK: [[PRED_LOAD_CONTINUE17]]: +; CHECK-NEXT: [[TMP32:%.*]] = phi <2 x i32> [ [[TMP36]], %[[PRED_LOAD_CONTINUE15]] ], [ [[TMP31]], %[[PRED_LOAD_IF16]] ] +; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP19]], <2 x i32> [[TMP28]], <2 x i32> zeroinitializer +; CHECK-NEXT: [[PREDPHI18:%.*]] = select <2 x i1> [[TMP37]], <2 x i32> [[TMP32]], <2 x i32> [[PREDPHI]] +; CHECK-NEXT: [[TMP41:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[TMP4]] +; CHECK-NEXT: store <2 x i32> [[PREDPHI18]], ptr [[TMP41]], align 4, !alias.scope [[META21:![0-9]+]], !noalias [[META23:![0-9]+]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 +; CHECK-NEXT: [[TMP63:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP63]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], [[EXIT:label %.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; +entry: + br label %loop + +loop: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop.latch ] + %gep.src = getelementptr inbounds i32, ptr %src, i32 %iv + %gep.cond1 = getelementptr inbounds i32, ptr %cond1, i32 %iv + %gep.cond2 = getelementptr inbounds i32, ptr %cond2, i32 %iv + %l.c1 = load i32, ptr %gep.cond1 + %l.c2 = load i32, ptr %gep.cond2 + %c1 = icmp ule i32 %l.c1, 11 + %c2 = icmp ule i32 %l.c2, 20 + br i1 %c1, label %then, label %else + +then: + %l.src = load i32, ptr %gep.src, align 4 + br label %loop.latch + +else: + br i1 %c2, label %else.then, label %loop.latch + +else.then: + %l.src.2 = load i32, ptr %gep.src, align 4 + %add = add i32 %l.src.2, 10 + br label %loop.latch + +loop.latch: + %merge = phi i32 [ %l.src, %then ], [ %add, %else.then ], [ 0, %else ] + %gep.dst = getelementptr inbounds i32, ptr %dst, i32 %iv + store i32 %merge, ptr %gep.dst, align 4 + %iv.next = add nuw nsw i32 %iv, 1 + %ec = icmp eq i32 %iv.next, %n + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + +; Negative test: Different access sizes - should NOT hoist +; Both loads use the same pointer but have different types (i8 vs i32) +define void @different_access_sizes(ptr %dst, ptr %src, ptr %cond, i32 %n) { +; CHECK-LABEL: define void @different_access_sizes( +; CHECK-SAME: 
ptr [[DST:%.*]], ptr [[SRC:%.*]], ptr [[COND:%.*]], i32 [[N:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 2 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]] +; CHECK: [[VECTOR_MEMCHECK]]: +; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[N]], -1 +; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64 +; CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 2 +; CHECK-NEXT: [[TMP3:%.*]] = add nuw nsw i64 [[TMP2]], 4 +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP3]] +; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[COND]], i64 [[TMP3]] +; CHECK-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP3]] +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP1]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[COND]], [[SCEVGEP]] +; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; CHECK-NEXT: [[BOUND03:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP2]] +; CHECK-NEXT: [[BOUND14:%.*]] = icmp ult ptr [[SRC]], [[SCEVGEP]] +; CHECK-NEXT: [[FOUND_CONFLICT5:%.*]] = and i1 [[BOUND03]], [[BOUND14]] +; CHECK-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT5]] +; CHECK-NEXT: br i1 [[CONFLICT_RDX]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N]], 2 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_LOAD_CONTINUE11:.*]] ] +; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[INDEX]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[INDEX]], 1 +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[TMP4]] +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[TMP5]] +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x ptr> poison, ptr [[TMP6]], i32 0 +; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x ptr> [[TMP8]], ptr [[TMP7]], i32 1 +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[COND]], i32 [[TMP4]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP10]], align 4, !alias.scope [[META26:![0-9]+]] +; CHECK-NEXT: [[TMP11:%.*]] = icmp ule <2 x i32> [[WIDE_LOAD]], splat (i32 11) +; CHECK-NEXT: [[TMP12:%.*]] = xor <2 x i1> [[TMP11]], splat (i1 true) +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x i1> [[TMP12]], i32 0 +; CHECK-NEXT: br i1 [[TMP13]], label %[[PRED_LOAD_IF:.*]], label %[[PRED_LOAD_CONTINUE:.*]] +; CHECK: [[PRED_LOAD_IF]]: +; CHECK-NEXT: [[TMP14:%.*]] = load i32, ptr [[TMP6]], align 4, !alias.scope [[META29:![0-9]+]] +; CHECK-NEXT: [[TMP15:%.*]] = insertelement <2 x i32> poison, i32 [[TMP14]], i32 0 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE]] +; CHECK: [[PRED_LOAD_CONTINUE]]: +; CHECK-NEXT: [[TMP16:%.*]] = phi <2 x i32> [ poison, %[[VECTOR_BODY]] ], [ [[TMP15]], %[[PRED_LOAD_IF]] ] +; CHECK-NEXT: [[TMP17:%.*]] = extractelement <2 x i1> [[TMP12]], i32 1 +; CHECK-NEXT: br i1 [[TMP17]], label %[[PRED_LOAD_IF6:.*]], label %[[PRED_LOAD_CONTINUE7:.*]] +; CHECK: [[PRED_LOAD_IF6]]: +; CHECK-NEXT: [[TMP18:%.*]] = load i32, ptr [[TMP7]], align 4, !alias.scope [[META29]] +; CHECK-NEXT: [[TMP19:%.*]] = insertelement <2 x i32> [[TMP16]], i32 [[TMP18]], i32 1 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE7]] +; CHECK: [[PRED_LOAD_CONTINUE7]]: +; CHECK-NEXT: [[TMP20:%.*]] = phi <2 x i32> [ [[TMP16]], %[[PRED_LOAD_CONTINUE]] ], [ [[TMP19]], 
%[[PRED_LOAD_IF6]] ] +; CHECK-NEXT: [[TMP21:%.*]] = add <2 x i32> [[TMP20]], splat (i32 10) +; CHECK-NEXT: [[TMP22:%.*]] = extractelement <2 x i1> [[TMP11]], i32 0 +; CHECK-NEXT: br i1 [[TMP22]], label %[[PRED_LOAD_IF8:.*]], label %[[PRED_LOAD_CONTINUE9:.*]] +; CHECK: [[PRED_LOAD_IF8]]: +; CHECK-NEXT: [[TMP23:%.*]] = load i8, ptr [[TMP6]], align 4, !alias.scope [[META29]] +; CHECK-NEXT: [[TMP24:%.*]] = insertelement <2 x i8> poison, i8 [[TMP23]], i32 0 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE9]] +; CHECK: [[PRED_LOAD_CONTINUE9]]: +; CHECK-NEXT: [[TMP25:%.*]] = phi <2 x i8> [ poison, %[[PRED_LOAD_CONTINUE7]] ], [ [[TMP24]], %[[PRED_LOAD_IF8]] ] +; CHECK-NEXT: [[TMP26:%.*]] = extractelement <2 x i1> [[TMP11]], i32 1 +; CHECK-NEXT: br i1 [[TMP26]], label %[[PRED_LOAD_IF10:.*]], label %[[PRED_LOAD_CONTINUE11]] +; CHECK: [[PRED_LOAD_IF10]]: +; CHECK-NEXT: [[TMP27:%.*]] = load i8, ptr [[TMP7]], align 4, !alias.scope [[META29]] +; CHECK-NEXT: [[TMP28:%.*]] = insertelement <2 x i8> [[TMP25]], i8 [[TMP27]], i32 1 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE11]] +; CHECK: [[PRED_LOAD_CONTINUE11]]: +; CHECK-NEXT: [[TMP29:%.*]] = phi <2 x i8> [ [[TMP25]], %[[PRED_LOAD_CONTINUE9]] ], [ [[TMP28]], %[[PRED_LOAD_IF10]] ] +; CHECK-NEXT: [[TMP30:%.*]] = zext <2 x i8> [[TMP29]] to <2 x i32> +; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP11]], <2 x i32> [[TMP30]], <2 x i32> [[TMP21]] +; CHECK-NEXT: [[TMP31:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[TMP4]] +; CHECK-NEXT: store <2 x i32> [[PREDPHI]], ptr [[TMP31]], align 4, !alias.scope [[META31:![0-9]+]], !noalias [[META33:![0-9]+]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 +; CHECK-NEXT: [[TMP32:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP32]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP34:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], [[EXIT:label %.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; +entry: + br label %loop + +loop: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop.latch ] + %gep.src = getelementptr inbounds i32, ptr %src, i32 %iv + %gep.cond = getelementptr inbounds i32, ptr %cond, i32 %iv + %l.c = load i32, ptr %gep.cond + %c = icmp ule i32 %l.c, 11 + br i1 %c, label %then, label %else + +then: + %l.src = load i8, ptr %gep.src, align 4 + %ext = zext i8 %l.src to i32 + br label %loop.latch + +else: + %l.src.2 = load i32, ptr %gep.src, align 4 + %add = add i32 %l.src.2, 10 + br label %loop.latch + +loop.latch: + %merge = phi i32 [ %ext, %then ], [ %add, %else ] + %gep.dst = getelementptr inbounds i32, ptr %dst, i32 %iv + store i32 %merge, ptr %gep.dst, align 4 + %iv.next = add nuw nsw i32 %iv, 1 + %ec = icmp eq i32 %iv.next, %n + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + +; Positive test: Same address with different alignments - should hoist with minimum alignment +define void @different_alignments_same_address(ptr %dst, ptr %src, ptr %cond, i32 %n) { +; CHECK-LABEL: define void @different_alignments_same_address( +; CHECK-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], ptr [[COND:%.*]], i32 [[N:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 2 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]] +; CHECK: [[VECTOR_MEMCHECK]]: +; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[N]], -1 +; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64 +; CHECK-NEXT: [[TMP2:%.*]] = shl 
nuw nsw i64 [[TMP1]], 2 +; CHECK-NEXT: [[TMP3:%.*]] = add nuw nsw i64 [[TMP2]], 4 +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP3]] +; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[COND]], i64 [[TMP3]] +; CHECK-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP3]] +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP1]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[COND]], [[SCEVGEP]] +; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; CHECK-NEXT: [[BOUND03:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP2]] +; CHECK-NEXT: [[BOUND14:%.*]] = icmp ult ptr [[SRC]], [[SCEVGEP]] +; CHECK-NEXT: [[FOUND_CONFLICT5:%.*]] = and i1 [[BOUND03]], [[BOUND14]] +; CHECK-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT5]] +; CHECK-NEXT: br i1 [[CONFLICT_RDX]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N]], 2 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_LOAD_CONTINUE11:.*]] ] +; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[INDEX]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[INDEX]], 1 +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[TMP4]] +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[TMP5]] +; CHECK-NEXT: [[TMP10:%.*]] = insertelement <2 x ptr> poison, ptr [[TMP8]], i32 0 +; CHECK-NEXT: [[TMP11:%.*]] = insertelement <2 x ptr> [[TMP10]], ptr [[TMP9]], i32 1 +; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[COND]], i32 [[TMP4]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP20]], align 4, !alias.scope [[META36:![0-9]+]] +; CHECK-NEXT: [[TMP15:%.*]] = icmp ule <2 x i32> [[WIDE_LOAD]], splat (i32 11) +; CHECK-NEXT: [[TMP16:%.*]] = xor <2 x i1> [[TMP15]], splat (i1 true) +; CHECK-NEXT: [[TMP17:%.*]] = extractelement <2 x i1> [[TMP16]], i32 0 +; CHECK-NEXT: br i1 [[TMP17]], label %[[PRED_LOAD_IF:.*]], label %[[PRED_LOAD_CONTINUE:.*]] +; CHECK: [[PRED_LOAD_IF]]: +; CHECK-NEXT: [[TMP18:%.*]] = load i32, ptr [[TMP8]], align 4, !alias.scope [[META39:![0-9]+]] +; CHECK-NEXT: [[TMP19:%.*]] = insertelement <2 x i32> poison, i32 [[TMP18]], i32 0 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE]] +; CHECK: [[PRED_LOAD_CONTINUE]]: +; CHECK-NEXT: [[TMP35:%.*]] = phi <2 x i32> [ poison, %[[VECTOR_BODY]] ], [ [[TMP19]], %[[PRED_LOAD_IF]] ] +; CHECK-NEXT: [[TMP21:%.*]] = extractelement <2 x i1> [[TMP16]], i32 1 +; CHECK-NEXT: br i1 [[TMP21]], label %[[PRED_LOAD_IF6:.*]], label %[[PRED_LOAD_CONTINUE7:.*]] +; CHECK: [[PRED_LOAD_IF6]]: +; CHECK-NEXT: [[TMP22:%.*]] = load i32, ptr [[TMP9]], align 4, !alias.scope [[META39]] +; CHECK-NEXT: [[TMP23:%.*]] = insertelement <2 x i32> [[TMP35]], i32 [[TMP22]], i32 1 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE7]] +; CHECK: [[PRED_LOAD_CONTINUE7]]: +; CHECK-NEXT: [[TMP24:%.*]] = phi <2 x i32> [ [[TMP35]], %[[PRED_LOAD_CONTINUE]] ], [ [[TMP23]], %[[PRED_LOAD_IF6]] ] +; CHECK-NEXT: [[TMP25:%.*]] = add <2 x i32> [[TMP24]], splat (i32 10) +; CHECK-NEXT: [[TMP30:%.*]] = extractelement <2 x i1> [[TMP15]], i32 0 +; CHECK-NEXT: br i1 [[TMP30]], label %[[PRED_LOAD_IF8:.*]], label %[[PRED_LOAD_CONTINUE9:.*]] +; CHECK: [[PRED_LOAD_IF8]]: +; CHECK-NEXT: [[TMP26:%.*]] = load i32, ptr [[TMP8]], align 2, !alias.scope [[META39]] +; CHECK-NEXT: [[TMP31:%.*]] = insertelement <2 x i32> 
poison, i32 [[TMP26]], i32 0 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE9]] +; CHECK: [[PRED_LOAD_CONTINUE9]]: +; CHECK-NEXT: [[TMP33:%.*]] = phi <2 x i32> [ poison, %[[PRED_LOAD_CONTINUE7]] ], [ [[TMP31]], %[[PRED_LOAD_IF8]] ] +; CHECK-NEXT: [[TMP32:%.*]] = extractelement <2 x i1> [[TMP15]], i32 1 +; CHECK-NEXT: br i1 [[TMP32]], label %[[PRED_LOAD_IF10:.*]], label %[[PRED_LOAD_CONTINUE11]] +; CHECK: [[PRED_LOAD_IF10]]: +; CHECK-NEXT: [[TMP27:%.*]] = load i32, ptr [[TMP9]], align 2, !alias.scope [[META39]] +; CHECK-NEXT: [[TMP28:%.*]] = insertelement <2 x i32> [[TMP33]], i32 [[TMP27]], i32 1 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE11]] +; CHECK: [[PRED_LOAD_CONTINUE11]]: +; CHECK-NEXT: [[TMP29:%.*]] = phi <2 x i32> [ [[TMP33]], %[[PRED_LOAD_CONTINUE9]] ], [ [[TMP28]], %[[PRED_LOAD_IF10]] ] +; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP15]], <2 x i32> [[TMP29]], <2 x i32> [[TMP25]] +; CHECK-NEXT: [[TMP34:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[TMP4]] +; CHECK-NEXT: store <2 x i32> [[PREDPHI]], ptr [[TMP34]], align 4, !alias.scope [[META41:![0-9]+]], !noalias [[META43:![0-9]+]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 +; CHECK-NEXT: [[TMP48:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP48]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP44:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], [[EXIT:label %.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; +entry: + br label %loop + +loop: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop.latch ] + %gep.src = getelementptr inbounds i32, ptr %src, i32 %iv + %gep.cond = getelementptr inbounds i32, ptr %cond, i32 %iv + %l.c = load i32, ptr %gep.cond + %c = icmp ule i32 %l.c, 11 + br i1 %c, label %then, label %else + +then: + %l.src = load i32, ptr %gep.src, align 2 + br label %loop.latch + +else: + %l.src.2 = load i32, ptr %gep.src, align 4 + %add = add i32 %l.src.2, 10 + br label %loop.latch + +loop.latch: + %merge = phi i32 [ %l.src, %then ], [ %add, %else ] + %gep.dst = getelementptr inbounds i32, ptr %dst, i32 %iv + store i32 %merge, ptr %gep.dst, align 4 + %iv.next = add nuw nsw i32 %iv, 1 + %ec = icmp eq i32 %iv.next, %n + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + +; Negative test: Volatile loads - should NOT hoist +define void @volatile_load(ptr %dst, ptr %src, ptr %cond, i32 %n) { +; CHECK-LABEL: define void @volatile_load( +; CHECK-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], ptr [[COND:%.*]], i32 [[N:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] +; CHECK-NEXT: [[GEP_SRC:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[IV]] +; CHECK-NEXT: [[GEP_COND:%.*]] = getelementptr inbounds i32, ptr [[COND]], i32 [[IV]] +; CHECK-NEXT: [[L_C:%.*]] = load i32, ptr [[GEP_COND]], align 4 +; CHECK-NEXT: [[C:%.*]] = icmp ule i32 [[L_C]], 11 +; CHECK-NEXT: br i1 [[C]], label %[[THEN:.*]], label %[[ELSE:.*]] +; CHECK: [[THEN]]: +; CHECK-NEXT: [[L_SRC:%.*]] = load volatile i32, ptr [[GEP_SRC]], align 4 +; CHECK-NEXT: br label %[[LOOP_LATCH]] +; CHECK: [[ELSE]]: +; CHECK-NEXT: [[L_SRC_2:%.*]] = load i32, ptr [[GEP_SRC]], align 4 +; CHECK-NEXT: [[ADD:%.*]] = add i32 [[L_SRC_2]], 10 +; CHECK-NEXT: br label %[[LOOP_LATCH]] +; CHECK: [[LOOP_LATCH]]: +; CHECK-NEXT: [[MERGE:%.*]] = phi i32 [ [[L_SRC]], %[[THEN]] ], [ [[ADD]], 
%[[ELSE]] ] +; CHECK-NEXT: [[GEP_DST:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[IV]] +; CHECK-NEXT: store i32 [[MERGE]], ptr [[GEP_DST]], align 4 +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1 +; CHECK-NEXT: [[EC:%.*]] = icmp eq i32 [[IV_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; +entry: + br label %loop + +loop: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop.latch ] + %gep.src = getelementptr inbounds i32, ptr %src, i32 %iv + %gep.cond = getelementptr inbounds i32, ptr %cond, i32 %iv + %l.c = load i32, ptr %gep.cond + %c = icmp ule i32 %l.c, 11 + br i1 %c, label %then, label %else + +then: + %l.src = load volatile i32, ptr %gep.src, align 4 + br label %loop.latch + +else: + %l.src.2 = load i32, ptr %gep.src, align 4 + %add = add i32 %l.src.2, 10 + br label %loop.latch + +loop.latch: + %merge = phi i32 [ %l.src, %then ], [ %add, %else ] + %gep.dst = getelementptr inbounds i32, ptr %dst, i32 %iv + store i32 %merge, ptr %gep.dst, align 4 + %iv.next = add nuw nsw i32 %iv, 1 + %ec = icmp eq i32 %iv.next, %n + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + +; Test hoisting with duplicate GEPs: The same address is computed by different +; GEP instructions in different branches. The hoisting pass should use SCEV to +; recognize they compute the same address and hoist the load. +define void @duplicate_gep(ptr %dst, ptr %src, ptr %cond, i32 %n) { +; CHECK-LABEL: define void @duplicate_gep( +; CHECK-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], ptr [[COND:%.*]], i32 [[N:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 2 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]] +; CHECK: [[VECTOR_MEMCHECK]]: +; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[N]], -1 +; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64 +; CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 2 +; CHECK-NEXT: [[TMP3:%.*]] = add nuw nsw i64 [[TMP2]], 4 +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP3]] +; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[COND]], i64 [[TMP3]] +; CHECK-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP3]] +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP1]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[COND]], [[SCEVGEP]] +; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; CHECK-NEXT: [[BOUND03:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP2]] +; CHECK-NEXT: [[BOUND14:%.*]] = icmp ult ptr [[SRC]], [[SCEVGEP]] +; CHECK-NEXT: [[FOUND_CONFLICT5:%.*]] = and i1 [[BOUND03]], [[BOUND14]] +; CHECK-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT5]] +; CHECK-NEXT: br i1 [[CONFLICT_RDX]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N]], 2 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_LOAD_CONTINUE11:.*]] ] +; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[INDEX]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[INDEX]], 1 +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[COND]], i32 [[TMP4]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP6]], align 4, !alias.scope [[META46:![0-9]+]] +; CHECK-NEXT: [[TMP7:%.*]] = icmp ule <2 x i32> [[WIDE_LOAD]], splat (i32 11) 
+; CHECK-NEXT: [[TMP8:%.*]] = xor <2 x i1> [[TMP7]], splat (i1 true) +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x i1> [[TMP8]], i32 0 +; CHECK-NEXT: br i1 [[TMP9]], label %[[PRED_LOAD_IF:.*]], label %[[PRED_LOAD_CONTINUE:.*]] +; CHECK: [[PRED_LOAD_IF]]: +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[TMP4]] +; CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP10]], align 4, !alias.scope [[META49:![0-9]+]] +; CHECK-NEXT: [[TMP12:%.*]] = insertelement <2 x i32> poison, i32 [[TMP11]], i32 0 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE]] +; CHECK: [[PRED_LOAD_CONTINUE]]: +; CHECK-NEXT: [[TMP13:%.*]] = phi <2 x i32> [ poison, %[[VECTOR_BODY]] ], [ [[TMP12]], %[[PRED_LOAD_IF]] ] +; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x i1> [[TMP8]], i32 1 +; CHECK-NEXT: br i1 [[TMP14]], label %[[PRED_LOAD_IF6:.*]], label %[[PRED_LOAD_CONTINUE7:.*]] +; CHECK: [[PRED_LOAD_IF6]]: +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[TMP5]] +; CHECK-NEXT: [[TMP16:%.*]] = load i32, ptr [[TMP15]], align 4, !alias.scope [[META49]] +; CHECK-NEXT: [[TMP17:%.*]] = insertelement <2 x i32> [[TMP13]], i32 [[TMP16]], i32 1 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE7]] +; CHECK: [[PRED_LOAD_CONTINUE7]]: +; CHECK-NEXT: [[TMP18:%.*]] = phi <2 x i32> [ [[TMP13]], %[[PRED_LOAD_CONTINUE]] ], [ [[TMP17]], %[[PRED_LOAD_IF6]] ] +; CHECK-NEXT: [[TMP19:%.*]] = add <2 x i32> [[TMP18]], splat (i32 10) +; CHECK-NEXT: [[TMP20:%.*]] = extractelement <2 x i1> [[TMP7]], i32 0 +; CHECK-NEXT: br i1 [[TMP20]], label %[[PRED_LOAD_IF8:.*]], label %[[PRED_LOAD_CONTINUE9:.*]] +; CHECK: [[PRED_LOAD_IF8]]: +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[TMP4]] +; CHECK-NEXT: [[TMP22:%.*]] = load i32, ptr [[TMP21]], align 4, !alias.scope [[META49]] +; CHECK-NEXT: [[TMP23:%.*]] = insertelement <2 x i32> poison, i32 [[TMP22]], i32 0 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE9]] +; CHECK: [[PRED_LOAD_CONTINUE9]]: +; CHECK-NEXT: [[TMP24:%.*]] = phi <2 x i32> [ poison, %[[PRED_LOAD_CONTINUE7]] ], [ [[TMP23]], %[[PRED_LOAD_IF8]] ] +; CHECK-NEXT: [[TMP25:%.*]] = extractelement <2 x i1> [[TMP7]], i32 1 +; CHECK-NEXT: br i1 [[TMP25]], label %[[PRED_LOAD_IF10:.*]], label %[[PRED_LOAD_CONTINUE11]] +; CHECK: [[PRED_LOAD_IF10]]: +; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[TMP5]] +; CHECK-NEXT: [[TMP27:%.*]] = load i32, ptr [[TMP26]], align 4, !alias.scope [[META49]] +; CHECK-NEXT: [[TMP28:%.*]] = insertelement <2 x i32> [[TMP24]], i32 [[TMP27]], i32 1 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE11]] +; CHECK: [[PRED_LOAD_CONTINUE11]]: +; CHECK-NEXT: [[TMP29:%.*]] = phi <2 x i32> [ [[TMP24]], %[[PRED_LOAD_CONTINUE9]] ], [ [[TMP28]], %[[PRED_LOAD_IF10]] ] +; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP7]], <2 x i32> [[TMP29]], <2 x i32> [[TMP19]] +; CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[TMP4]] +; CHECK-NEXT: store <2 x i32> [[PREDPHI]], ptr [[TMP30]], align 4, !alias.scope [[META51:![0-9]+]], !noalias [[META53:![0-9]+]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 +; CHECK-NEXT: [[TMP31:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP31]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP54:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], [[EXIT:label %.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; +entry: + br label %loop + +loop: + %iv = phi i32 
[ 0, %entry ], [ %iv.next, %loop.latch ]
+ %gep.cond = getelementptr inbounds i32, ptr %cond, i32 %iv
+ %l.c = load i32, ptr %gep.cond
+ %c = icmp ule i32 %l.c, 11
+ br i1 %c, label %then, label %else
+
+then:
+ %gep.src.then = getelementptr inbounds i32, ptr %src, i32 %iv
+ %l.src = load i32, ptr %gep.src.then, align 4
+ br label %loop.latch
+
+else:
+ %gep.src.else = getelementptr inbounds i32, ptr %src, i32 %iv
+ %l.src.2 = load i32, ptr %gep.src.else, align 4
+ %add = add i32 %l.src.2, 10
+ br label %loop.latch
+
+loop.latch:
+ %merge = phi i32 [ %l.src, %then ], [ %add, %else ]
+ %gep.dst = getelementptr inbounds i32, ptr %dst, i32 %iv
+ store i32 %merge, ptr %gep.dst, align 4
+ %iv.next = add nuw nsw i32 %iv, 1
+ %ec = icmp eq i32 %iv.next, %n
+ br i1 %ec, label %exit, label %loop
+
+exit:
+ ret void
+}
+
+; Test with non-unit-stride loads: the i32 loads are addressed through i64
+; GEPs, so they have a stride of 8 bytes instead of the unit stride of 4
+; bytes. The hoisting optimization should still work since both loads access
+; the same address with the same stride.
+define void @non_unit_stride_i64(ptr %dst, ptr %src, ptr %cond, i32 %n) {
+; CHECK-LABEL: define void @non_unit_stride_i64(
+; CHECK-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], ptr [[COND:%.*]], i32 [[N:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 2
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
+; CHECK: [[VECTOR_MEMCHECK]]:
+; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[N]], -1
+; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64
+; CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 2
+; CHECK-NEXT: [[TMP3:%.*]] = add nuw nsw i64 [[TMP2]], 4
+; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP3]]
+; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[COND]], i64 [[TMP3]]
+; CHECK-NEXT: [[TMP4:%.*]] = shl nuw nsw i64 [[TMP1]], 3
+; CHECK-NEXT: [[TMP5:%.*]] = add nuw nsw i64 [[TMP4]], 4
+; CHECK-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP5]]
+; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP1]]
+; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[COND]], [[SCEVGEP]]
+; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
+; CHECK-NEXT: [[BOUND03:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP2]]
+; CHECK-NEXT: [[BOUND14:%.*]] = icmp ult ptr [[SRC]], [[SCEVGEP]]
+; CHECK-NEXT: [[FOUND_CONFLICT5:%.*]] = and i1 [[BOUND03]], [[BOUND14]]
+; CHECK-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT5]]
+; CHECK-NEXT: br i1 [[CONFLICT_RDX]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N]], 2
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[N]], [[N_MOD_VF]]
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_LOAD_CONTINUE11:.*]] ]
+; CHECK-NEXT: [[TMP6:%.*]] = add i32 [[INDEX]], 0
+; CHECK-NEXT: [[TMP7:%.*]] = add i32 [[INDEX]], 1
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[COND]], i32 [[TMP6]]
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP8]], align 4, !alias.scope [[META56:![0-9]+]]
+; CHECK-NEXT: [[TMP9:%.*]] = icmp ule <2 x i32> [[WIDE_LOAD]], splat (i32 11)
+; CHECK-NEXT: [[TMP10:%.*]] = xor <2 x i1> [[TMP9]], splat (i1 true)
+; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x i1> [[TMP10]], i32 0
+; CHECK-NEXT: br i1 [[TMP11]], label %[[PRED_LOAD_IF:.*]], label
%[[PRED_LOAD_CONTINUE:.*]] +; CHECK: [[PRED_LOAD_IF]]: +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i32 [[TMP6]] +; CHECK-NEXT: [[TMP13:%.*]] = load i32, ptr [[TMP12]], align 4, !alias.scope [[META59:![0-9]+]] +; CHECK-NEXT: [[TMP14:%.*]] = insertelement <2 x i32> poison, i32 [[TMP13]], i32 0 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE]] +; CHECK: [[PRED_LOAD_CONTINUE]]: +; CHECK-NEXT: [[TMP15:%.*]] = phi <2 x i32> [ poison, %[[VECTOR_BODY]] ], [ [[TMP14]], %[[PRED_LOAD_IF]] ] +; CHECK-NEXT: [[TMP16:%.*]] = extractelement <2 x i1> [[TMP10]], i32 1 +; CHECK-NEXT: br i1 [[TMP16]], label %[[PRED_LOAD_IF6:.*]], label %[[PRED_LOAD_CONTINUE7:.*]] +; CHECK: [[PRED_LOAD_IF6]]: +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i32 [[TMP7]] +; CHECK-NEXT: [[TMP18:%.*]] = load i32, ptr [[TMP17]], align 4, !alias.scope [[META59]] +; CHECK-NEXT: [[TMP19:%.*]] = insertelement <2 x i32> [[TMP15]], i32 [[TMP18]], i32 1 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE7]] +; CHECK: [[PRED_LOAD_CONTINUE7]]: +; CHECK-NEXT: [[TMP20:%.*]] = phi <2 x i32> [ [[TMP15]], %[[PRED_LOAD_CONTINUE]] ], [ [[TMP19]], %[[PRED_LOAD_IF6]] ] +; CHECK-NEXT: [[TMP21:%.*]] = add <2 x i32> [[TMP20]], splat (i32 10) +; CHECK-NEXT: [[TMP22:%.*]] = extractelement <2 x i1> [[TMP9]], i32 0 +; CHECK-NEXT: br i1 [[TMP22]], label %[[PRED_LOAD_IF8:.*]], label %[[PRED_LOAD_CONTINUE9:.*]] +; CHECK: [[PRED_LOAD_IF8]]: +; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i32 [[TMP6]] +; CHECK-NEXT: [[TMP24:%.*]] = load i32, ptr [[TMP23]], align 4, !alias.scope [[META59]] +; CHECK-NEXT: [[TMP25:%.*]] = insertelement <2 x i32> poison, i32 [[TMP24]], i32 0 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE9]] +; CHECK: [[PRED_LOAD_CONTINUE9]]: +; CHECK-NEXT: [[TMP26:%.*]] = phi <2 x i32> [ poison, %[[PRED_LOAD_CONTINUE7]] ], [ [[TMP25]], %[[PRED_LOAD_IF8]] ] +; CHECK-NEXT: [[TMP27:%.*]] = extractelement <2 x i1> [[TMP9]], i32 1 +; CHECK-NEXT: br i1 [[TMP27]], label %[[PRED_LOAD_IF10:.*]], label %[[PRED_LOAD_CONTINUE11]] +; CHECK: [[PRED_LOAD_IF10]]: +; CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i32 [[TMP7]] +; CHECK-NEXT: [[TMP29:%.*]] = load i32, ptr [[TMP28]], align 4, !alias.scope [[META59]] +; CHECK-NEXT: [[TMP30:%.*]] = insertelement <2 x i32> [[TMP26]], i32 [[TMP29]], i32 1 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE11]] +; CHECK: [[PRED_LOAD_CONTINUE11]]: +; CHECK-NEXT: [[TMP31:%.*]] = phi <2 x i32> [ [[TMP26]], %[[PRED_LOAD_CONTINUE9]] ], [ [[TMP30]], %[[PRED_LOAD_IF10]] ] +; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP9]], <2 x i32> [[TMP31]], <2 x i32> [[TMP21]] +; CHECK-NEXT: [[TMP32:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[TMP6]] +; CHECK-NEXT: store <2 x i32> [[PREDPHI]], ptr [[TMP32]], align 4, !alias.scope [[META61:![0-9]+]], !noalias [[META63:![0-9]+]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 +; CHECK-NEXT: [[TMP33:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP33]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP64:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], [[EXIT:label %.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; +entry: + br label %loop + +loop: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop.latch ] + %gep.cond = getelementptr inbounds i32, ptr %cond, i32 %iv + %l.c = load i32, ptr %gep.cond + %c = icmp ule i32 %l.c, 11 + br i1 %c, label %then, label %else + 
+then:
+ %gep.src.then = getelementptr inbounds i64, ptr %src, i32 %iv
+ %l.src = load i32, ptr %gep.src.then, align 4
+ br label %loop.latch
+
+else:
+ %gep.src.else = getelementptr inbounds i64, ptr %src, i32 %iv
+ %l.src.2 = load i32, ptr %gep.src.else, align 4
+ %add = add i32 %l.src.2, 10
+ br label %loop.latch
+
+loop.latch:
+ %merge = phi i32 [ %l.src, %then ], [ %add, %else ]
+ %gep.dst = getelementptr inbounds i32, ptr %dst, i32 %iv
+ store i32 %merge, ptr %gep.dst, align 4
+ %iv.next = add nuw nsw i32 %iv, 1
+ %ec = icmp eq i32 %iv.next, %n
+ br i1 %ec, label %exit, label %loop
+
+exit:
+ ret void
+}
+
+
+; Test that loads inside masked regions (without individual masks) are
+; correctly detected and hoisted when they have complementary predicates.
+define void @hoist_loads_in_masked_regions(ptr noalias %dst, ptr noalias %src, ptr %cond, i32 %n) {
+; CHECK-LABEL: define void @hoist_loads_in_masked_regions(
+; CHECK-SAME: ptr noalias [[DST:%.*]], ptr noalias [[SRC:%.*]], ptr [[COND:%.*]], i32 [[N:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 2
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N]], 2
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[N]], [[N_MOD_VF]]
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i32, ptr [[SRC]], i32 [[INDEX]]
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[COND]], i32 [[INDEX]]
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP1]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = icmp ule <2 x i32> [[WIDE_LOAD]], splat (i32 11)
+; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <2 x i32>, ptr [[TMP0]], align 4
+; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP2]], <2 x i32> [[WIDE_LOAD1]], <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[INDEX]]
+; CHECK-NEXT: store <2 x i32> [[PREDPHI]], ptr [[TMP3]], align 4
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
+; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP4]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP66:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], [[EXIT:label %.*]], label %[[SCALAR_PH]]
+; CHECK: [[SCALAR_PH]]:
+;
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop.latch ]
+ %gep.src = getelementptr inbounds i32, ptr %src, i32 %iv
+ %gep.cond = getelementptr inbounds i32, ptr %cond, i32 %iv
+ %l.c = load i32, ptr %gep.cond
+ %c = icmp ule i32 %l.c, 11
+ br i1 %c, label %then, label %loop.latch
+
+then:
+ %l.src = load i32, ptr %gep.src, align 4
+ br label %loop.latch
+
+loop.latch:
+ %merge = phi i32 [ %l.src, %then ], [ 0, %loop ]
+ %l.src.2 = load i32, ptr %gep.src, align 4
+ %add = add i32 %l.src.2, %merge
+ %gep.dst = getelementptr inbounds i32, ptr %dst, i32 %iv
+ store i32 %merge, ptr %gep.dst, align 4
+ %iv.next = add nuw nsw i32 %iv, 1
+ %ec = icmp eq i32 %iv.next, %n
+ br i1 %ec, label %exit, label %loop
+
+exit:
+ ret void
+}
+
+; Test that when there are 3 or more regions with complementary predicates
+; loading from the same address, all loads are hoisted and replaced, not just
+; the first pair.
This tests the K loop that continues searching after finding +; the initial complementary pair. +define void @hoist_multiple_complementary_loads(ptr noalias %dst, ptr noalias %src, ptr %cond, i32 %n) { +; CHECK-LABEL: define void @hoist_multiple_complementary_loads( +; CHECK-SAME: ptr noalias [[DST:%.*]], ptr noalias [[SRC:%.*]], ptr [[COND:%.*]], i32 [[N:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 2 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N]], 2 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_LOAD_CONTINUE10:.*]] ] +; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[INDEX]], 1 +; CHECK-NEXT: [[TMP43:%.*]] = getelementptr inbounds i32, ptr [[COND]], i32 [[TMP0]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP43]], align 4 +; CHECK-NEXT: [[TMP7:%.*]] = icmp ne <2 x i32> [[WIDE_LOAD]], zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = xor <2 x i1> [[TMP7]], splat (i1 true) +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x i1> [[TMP8]], i32 0 +; CHECK-NEXT: br i1 [[TMP9]], label %[[PRED_LOAD_IF:.*]], label %[[PRED_LOAD_CONTINUE:.*]] +; CHECK: [[PRED_LOAD_IF]]: +; CHECK-NEXT: [[TMP63:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i32 [[TMP0]] +; CHECK-NEXT: [[TMP64:%.*]] = load i32, ptr [[TMP63]], align 4 +; CHECK-NEXT: [[TMP12:%.*]] = insertelement <2 x i32> poison, i32 [[TMP64]], i32 0 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE]] +; CHECK: [[PRED_LOAD_CONTINUE]]: +; CHECK-NEXT: [[TMP13:%.*]] = phi <2 x i32> [ poison, %[[VECTOR_BODY]] ], [ [[TMP12]], %[[PRED_LOAD_IF]] ] +; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x i1> [[TMP8]], i32 1 +; CHECK-NEXT: br i1 [[TMP14]], label %[[PRED_LOAD_IF1:.*]], label %[[PRED_LOAD_CONTINUE2:.*]] +; CHECK: [[PRED_LOAD_IF1]]: +; CHECK-NEXT: [[TMP69:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i32 [[TMP1]] +; CHECK-NEXT: [[TMP70:%.*]] = load i32, ptr [[TMP69]], align 4 +; CHECK-NEXT: [[TMP17:%.*]] = insertelement <2 x i32> [[TMP13]], i32 [[TMP70]], i32 1 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE2]] +; CHECK: [[PRED_LOAD_CONTINUE2]]: +; CHECK-NEXT: [[TMP28:%.*]] = phi <2 x i32> [ [[TMP13]], %[[PRED_LOAD_CONTINUE]] ], [ [[TMP17]], %[[PRED_LOAD_IF1]] ] +; CHECK-NEXT: [[TMP15:%.*]] = add <2 x i32> [[TMP28]], splat (i32 1) +; CHECK-NEXT: [[TMP16:%.*]] = icmp ne <2 x i32> [[WIDE_LOAD]], splat (i32 32) +; CHECK-NEXT: [[TMP29:%.*]] = xor <2 x i1> [[TMP16]], splat (i1 true) +; CHECK-NEXT: [[TMP32:%.*]] = select <2 x i1> [[TMP7]], <2 x i1> [[TMP29]], <2 x i1> zeroinitializer +; CHECK-NEXT: [[TMP19:%.*]] = extractelement <2 x i1> [[TMP32]], i32 0 +; CHECK-NEXT: br i1 [[TMP19]], label %[[PRED_LOAD_IF3:.*]], label %[[PRED_LOAD_CONTINUE4:.*]] +; CHECK: [[PRED_LOAD_IF3]]: +; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i32 [[TMP0]] +; CHECK-NEXT: [[TMP21:%.*]] = load i32, ptr [[TMP20]], align 4 +; CHECK-NEXT: [[TMP33:%.*]] = insertelement <2 x i32> poison, i32 [[TMP21]], i32 0 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE4]] +; CHECK: [[PRED_LOAD_CONTINUE4]]: +; CHECK-NEXT: [[TMP23:%.*]] = phi <2 x i32> [ poison, %[[PRED_LOAD_CONTINUE2]] ], [ [[TMP33]], %[[PRED_LOAD_IF3]] ] +; CHECK-NEXT: [[TMP24:%.*]] = extractelement <2 x i1> [[TMP32]], i32 1 +; 
CHECK-NEXT: br i1 [[TMP24]], label %[[PRED_LOAD_IF5:.*]], label %[[PRED_LOAD_CONTINUE6:.*]] +; CHECK: [[PRED_LOAD_IF5]]: +; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i32 [[TMP1]] +; CHECK-NEXT: [[TMP26:%.*]] = load i32, ptr [[TMP25]], align 4 +; CHECK-NEXT: [[TMP27:%.*]] = insertelement <2 x i32> [[TMP23]], i32 [[TMP26]], i32 1 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE6]] +; CHECK: [[PRED_LOAD_CONTINUE6]]: +; CHECK-NEXT: [[TMP18:%.*]] = phi <2 x i32> [ [[TMP23]], %[[PRED_LOAD_CONTINUE4]] ], [ [[TMP27]], %[[PRED_LOAD_IF5]] ] +; CHECK-NEXT: [[TMP22:%.*]] = mul <2 x i32> [[TMP18]], splat (i32 2) +; CHECK-NEXT: [[TMP30:%.*]] = select <2 x i1> [[TMP7]], <2 x i1> [[TMP16]], <2 x i1> zeroinitializer +; CHECK-NEXT: [[TMP31:%.*]] = extractelement <2 x i1> [[TMP30]], i32 0 +; CHECK-NEXT: br i1 [[TMP31]], label %[[PRED_LOAD_IF7:.*]], label %[[PRED_LOAD_CONTINUE8:.*]] +; CHECK: [[PRED_LOAD_IF7]]: +; CHECK-NEXT: [[TMP61:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i32 [[TMP0]] +; CHECK-NEXT: [[TMP34:%.*]] = load i32, ptr [[TMP61]], align 4 +; CHECK-NEXT: [[TMP38:%.*]] = insertelement <2 x i32> poison, i32 [[TMP34]], i32 0 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE8]] +; CHECK: [[PRED_LOAD_CONTINUE8]]: +; CHECK-NEXT: [[TMP35:%.*]] = phi <2 x i32> [ poison, %[[PRED_LOAD_CONTINUE6]] ], [ [[TMP38]], %[[PRED_LOAD_IF7]] ] +; CHECK-NEXT: [[TMP36:%.*]] = extractelement <2 x i1> [[TMP30]], i32 1 +; CHECK-NEXT: br i1 [[TMP36]], label %[[PRED_LOAD_IF9:.*]], label %[[PRED_LOAD_CONTINUE10]] +; CHECK: [[PRED_LOAD_IF9]]: +; CHECK-NEXT: [[TMP65:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i32 [[TMP1]] +; CHECK-NEXT: [[TMP37:%.*]] = load i32, ptr [[TMP65]], align 4 +; CHECK-NEXT: [[TMP44:%.*]] = insertelement <2 x i32> [[TMP35]], i32 [[TMP37]], i32 1 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE10]] +; CHECK: [[PRED_LOAD_CONTINUE10]]: +; CHECK-NEXT: [[TMP45:%.*]] = phi <2 x i32> [ [[TMP35]], %[[PRED_LOAD_CONTINUE8]] ], [ [[TMP44]], %[[PRED_LOAD_IF9]] ] +; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP32]], <2 x i32> [[TMP22]], <2 x i32> [[TMP15]] +; CHECK-NEXT: [[TMP42:%.*]] = select <2 x i1> [[TMP30]], <2 x i32> [[TMP45]], <2 x i32> [[PREDPHI]] +; CHECK-NEXT: [[TMP39:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[TMP0]] +; CHECK-NEXT: [[TMP40:%.*]] = getelementptr inbounds i8, ptr [[TMP39]], i64 32 +; CHECK-NEXT: store <2 x i32> [[TMP42]], ptr [[TMP40]], align 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 +; CHECK-NEXT: [[TMP41:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP41]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP68:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], [[EXIT:label %.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; +entry: + br label %loop + +loop: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop.latch ] + %gep.cond = getelementptr inbounds i32, ptr %cond, i32 %iv + %l.cond = load i32, ptr %gep.cond, align 4 + %c.1 = icmp ne i32 %l.cond, 0 + br i1 %c.1, label %check2, label %region3 + +check2: + %c.2 = icmp ne i32 %l.cond, 32 + br i1 %c.2, label %region1, label %region2 + +region1: + %gep.src.8.r1 = getelementptr inbounds i8, ptr %src, i32 %iv + %val1 = load i32, ptr %gep.src.8.r1, align 4 + br label %loop.latch + +region2: + %gep.src.8.r2 = getelementptr inbounds i8, ptr %src, i32 %iv + %val2 = load i32, ptr %gep.src.8.r2, align 4 + %mul = mul i32 %val2, 2 + br label %loop.latch + +region3: + 
%gep.src.8.r3 = getelementptr inbounds i8, ptr %src, i32 %iv + %val3 = load i32, ptr %gep.src.8.r3, align 4 + %add = add i32 %val3, 1 + br label %loop.latch + +loop.latch: + %merge = phi i32 [ %val1, %region1 ], [ %mul, %region2 ], [ %add, %region3 ] + %gep.dst = getelementptr inbounds i32, ptr %dst, i32 %iv + %offset.dst = getelementptr inbounds i8, ptr %gep.dst, i64 32 + store i32 %merge, ptr %offset.dst, align 4 + %iv.next = add nuw nsw i32 %iv, 1 + %ec = icmp eq i32 %iv.next, %n + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + +define void @hoist_predicated_load_with_chained_geps1(ptr %dst, ptr %src, i1 %cond) { +; CHECK-LABEL: define void @hoist_predicated_load_with_chained_geps1( +; CHECK-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], i1 [[COND:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: br label %[[VECTOR_MEMCHECK:.*]] +; CHECK: [[VECTOR_MEMCHECK]]: +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[DST]], i64 2 +; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[SRC]], i64 8 +; CHECK-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[SRC]], i64 2210 +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP2]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[SCEVGEP1]], [[SCEVGEP]] +; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i1> poison, i1 [[COND]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i1> [[BROADCAST_SPLATINSERT]], <2 x i1> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP0:%.*]] = xor <2 x i1> [[BROADCAST_SPLAT]], splat (i1 true) +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_LOAD_CONTINUE8:.*]] ] +; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 1 +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i1> [[TMP0]], i32 0 +; CHECK-NEXT: br i1 [[TMP3]], label %[[PRED_LOAD_IF:.*]], label %[[PRED_LOAD_CONTINUE:.*]] +; CHECK: [[PRED_LOAD_IF]]: +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr [11 x i16], ptr [[SRC]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[TMP4]], i64 8 +; CHECK-NEXT: [[TMP6:%.*]] = load i16, ptr [[TMP5]], align 2, !alias.scope [[META70:![0-9]+]] +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x i16> poison, i16 [[TMP6]], i32 0 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE]] +; CHECK: [[PRED_LOAD_CONTINUE]]: +; CHECK-NEXT: [[TMP8:%.*]] = phi <2 x i16> [ poison, %[[VECTOR_BODY]] ], [ [[TMP7]], %[[PRED_LOAD_IF]] ] +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x i1> [[TMP0]], i32 1 +; CHECK-NEXT: br i1 [[TMP9]], label %[[PRED_LOAD_IF3:.*]], label %[[PRED_LOAD_CONTINUE4:.*]] +; CHECK: [[PRED_LOAD_IF3]]: +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr [11 x i16], ptr [[SRC]], i64 [[TMP2]] +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[TMP10]], i64 8 +; CHECK-NEXT: [[TMP12:%.*]] = load i16, ptr [[TMP11]], align 2, !alias.scope [[META70]] +; CHECK-NEXT: [[TMP13:%.*]] = insertelement <2 x i16> [[TMP8]], i16 [[TMP12]], i32 1 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE4]] +; CHECK: [[PRED_LOAD_CONTINUE4]]: +; CHECK-NEXT: [[TMP14:%.*]] = phi <2 x i16> [ [[TMP8]], %[[PRED_LOAD_CONTINUE]] ], [ [[TMP13]], %[[PRED_LOAD_IF3]] ] +; CHECK-NEXT: br i1 [[COND]], label %[[PRED_LOAD_IF5:.*]], label %[[PRED_LOAD_CONTINUE6:.*]] +; CHECK: 
[[PRED_LOAD_IF5]]: +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr [11 x i16], ptr [[SRC]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr i8, ptr [[TMP15]], i64 8 +; CHECK-NEXT: [[TMP17:%.*]] = load i16, ptr [[TMP16]], align 2, !alias.scope [[META70]] +; CHECK-NEXT: [[TMP18:%.*]] = insertelement <2 x i16> poison, i16 [[TMP17]], i32 0 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE6]] +; CHECK: [[PRED_LOAD_CONTINUE6]]: +; CHECK-NEXT: [[TMP19:%.*]] = phi <2 x i16> [ poison, %[[PRED_LOAD_CONTINUE4]] ], [ [[TMP18]], %[[PRED_LOAD_IF5]] ] +; CHECK-NEXT: br i1 [[COND]], label %[[PRED_LOAD_IF7:.*]], label %[[PRED_LOAD_CONTINUE8]] +; CHECK: [[PRED_LOAD_IF7]]: +; CHECK-NEXT: [[TMP20:%.*]] = getelementptr [11 x i16], ptr [[SRC]], i64 [[TMP2]] +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr [[TMP20]], i64 8 +; CHECK-NEXT: [[TMP22:%.*]] = load i16, ptr [[TMP21]], align 2, !alias.scope [[META70]] +; CHECK-NEXT: [[TMP23:%.*]] = insertelement <2 x i16> [[TMP19]], i16 [[TMP22]], i32 1 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE8]] +; CHECK: [[PRED_LOAD_CONTINUE8]]: +; CHECK-NEXT: [[TMP24:%.*]] = phi <2 x i16> [ [[TMP19]], %[[PRED_LOAD_CONTINUE6]] ], [ [[TMP23]], %[[PRED_LOAD_IF7]] ] +; CHECK-NEXT: [[PREDPHI:%.*]] = select i1 [[COND]], <2 x i16> [[TMP24]], <2 x i16> [[TMP14]] +; CHECK-NEXT: [[TMP25:%.*]] = extractelement <2 x i16> [[PREDPHI]], i32 1 +; CHECK-NEXT: store i16 [[TMP25]], ptr [[DST]], align 2, !alias.scope [[META73:![0-9]+]], !noalias [[META70]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; CHECK-NEXT: [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100 +; CHECK-NEXT: br i1 [[TMP26]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP75:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: br label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; +entry: + br label %loop.header + +loop.header: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.latch ] + br i1 %cond, label %then, label %else + +then: + %gep1 = getelementptr [11 x i16], ptr %src, i64 %iv + %gep2 = getelementptr i8, ptr %gep1, i64 8 + %l.0 = load i16, ptr %gep2, align 2 + br label %loop.latch + +else: + %gep3 = getelementptr [11 x i16], ptr %src, i64 %iv + %gep4 = getelementptr i8, ptr %gep3, i64 8 + %l.1 = load i16, ptr %gep4, align 2 + br label %loop.latch + +loop.latch: + %merge = phi i16 [ %l.0, %then ], [ %l.1, %else ] + store i16 %merge, ptr %dst, align 2 + %iv.next = add i64 %iv, 1 + %ec = icmp eq i64 %iv, 100 + br i1 %ec, label %exit, label %loop.header + +exit: + ret void +} + +define void @hoist_predicated_load_with_chained_geps2(ptr %dst, ptr %src, i1 %cond) { +; CHECK-LABEL: define void @hoist_predicated_load_with_chained_geps2( +; CHECK-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], i1 [[COND:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: br label %[[VECTOR_MEMCHECK:.*]] +; CHECK: [[VECTOR_MEMCHECK]]: +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[DST]], i64 2 +; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[SRC]], i64 8 +; CHECK-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[SRC]], i64 2210 +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP2]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[SCEVGEP1]], [[SCEVGEP]] +; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i1> poison, i1 [[COND]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x 
i1> [[BROADCAST_SPLATINSERT]], <2 x i1> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP0:%.*]] = xor <2 x i1> [[BROADCAST_SPLAT]], splat (i1 true) +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_LOAD_CONTINUE8:.*]] ] +; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 1 +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr [11 x i16], ptr [[SRC]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr [11 x i16], ptr [[SRC]], i64 [[TMP2]] +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x ptr> poison, ptr [[TMP3]], i32 0 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x ptr> [[TMP5]], ptr [[TMP4]], i32 1 +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i1> [[TMP0]], i32 0 +; CHECK-NEXT: br i1 [[TMP7]], label %[[PRED_LOAD_IF:.*]], label %[[PRED_LOAD_CONTINUE:.*]] +; CHECK: [[PRED_LOAD_IF]]: +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[TMP3]], i64 8 +; CHECK-NEXT: [[TMP9:%.*]] = load i16, ptr [[TMP8]], align 2, !alias.scope [[META77:![0-9]+]] +; CHECK-NEXT: [[TMP10:%.*]] = insertelement <2 x i16> poison, i16 [[TMP9]], i32 0 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE]] +; CHECK: [[PRED_LOAD_CONTINUE]]: +; CHECK-NEXT: [[TMP11:%.*]] = phi <2 x i16> [ poison, %[[VECTOR_BODY]] ], [ [[TMP10]], %[[PRED_LOAD_IF]] ] +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x i1> [[TMP0]], i32 1 +; CHECK-NEXT: br i1 [[TMP12]], label %[[PRED_LOAD_IF3:.*]], label %[[PRED_LOAD_CONTINUE4:.*]] +; CHECK: [[PRED_LOAD_IF3]]: +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[TMP4]], i64 8 +; CHECK-NEXT: [[TMP14:%.*]] = load i16, ptr [[TMP13]], align 2, !alias.scope [[META77]] +; CHECK-NEXT: [[TMP15:%.*]] = insertelement <2 x i16> [[TMP11]], i16 [[TMP14]], i32 1 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE4]] +; CHECK: [[PRED_LOAD_CONTINUE4]]: +; CHECK-NEXT: [[TMP16:%.*]] = phi <2 x i16> [ [[TMP11]], %[[PRED_LOAD_CONTINUE]] ], [ [[TMP15]], %[[PRED_LOAD_IF3]] ] +; CHECK-NEXT: br i1 [[COND]], label %[[PRED_LOAD_IF5:.*]], label %[[PRED_LOAD_CONTINUE6:.*]] +; CHECK: [[PRED_LOAD_IF5]]: +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[TMP3]], i64 8 +; CHECK-NEXT: [[TMP18:%.*]] = load i16, ptr [[TMP17]], align 2, !alias.scope [[META77]] +; CHECK-NEXT: [[TMP19:%.*]] = insertelement <2 x i16> poison, i16 [[TMP18]], i32 0 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE6]] +; CHECK: [[PRED_LOAD_CONTINUE6]]: +; CHECK-NEXT: [[TMP20:%.*]] = phi <2 x i16> [ poison, %[[PRED_LOAD_CONTINUE4]] ], [ [[TMP19]], %[[PRED_LOAD_IF5]] ] +; CHECK-NEXT: br i1 [[COND]], label %[[PRED_LOAD_IF7:.*]], label %[[PRED_LOAD_CONTINUE8]] +; CHECK: [[PRED_LOAD_IF7]]: +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr [[TMP4]], i64 8 +; CHECK-NEXT: [[TMP22:%.*]] = load i16, ptr [[TMP21]], align 2, !alias.scope [[META77]] +; CHECK-NEXT: [[TMP23:%.*]] = insertelement <2 x i16> [[TMP20]], i16 [[TMP22]], i32 1 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE8]] +; CHECK: [[PRED_LOAD_CONTINUE8]]: +; CHECK-NEXT: [[TMP24:%.*]] = phi <2 x i16> [ [[TMP20]], %[[PRED_LOAD_CONTINUE6]] ], [ [[TMP23]], %[[PRED_LOAD_IF7]] ] +; CHECK-NEXT: [[PREDPHI:%.*]] = select i1 [[COND]], <2 x i16> [[TMP24]], <2 x i16> [[TMP16]] +; CHECK-NEXT: [[TMP25:%.*]] = extractelement <2 x i16> [[PREDPHI]], i32 1 +; CHECK-NEXT: store i16 [[TMP25]], ptr [[DST]], align 2, !alias.scope [[META80:![0-9]+]], !noalias [[META77]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; CHECK-NEXT: [[TMP26:%.*]] = icmp eq i64 
[[INDEX_NEXT]], 100 +; CHECK-NEXT: br i1 [[TMP26]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP82:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: br label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; +entry: + br label %loop.header + +loop.header: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.latch ] + %gep1 = getelementptr [11 x i16], ptr %src, i64 %iv + br i1 %cond, label %then, label %else + +then: + %gep2 = getelementptr i8, ptr %gep1, i64 8 + %l.0 = load i16, ptr %gep2, align 2 + br label %loop.latch + +else: + %gep3 = getelementptr i8, ptr %gep1, i64 8 + %l.1 = load i16, ptr %gep3, align 2 + br label %loop.latch + +loop.latch: + %merge = phi i16 [ %l.0, %then ], [ %l.1, %else ] + store i16 %merge, ptr %dst, align 2 + %iv.next = add i64 %iv, 1 + %ec = icmp eq i64 %iv, 100 + br i1 %ec, label %exit, label %loop.header + +exit: + ret void +} From 196ea57c9d481ce96af8cddf393149b8007396ea Mon Sep 17 00:00:00 2001 From: Tom Stellard Date: Tue, 11 Nov 2025 15:00:13 -0800 Subject: [PATCH 62/64] workflows/libclang-abi-tests: Use new container (#167459) --- .github/workflows/libclang-abi-tests.yml | 19 ++++--------------- 1 file changed, 4 insertions(+), 15 deletions(-) diff --git a/.github/workflows/libclang-abi-tests.yml b/.github/workflows/libclang-abi-tests.yml index 6377dd53d1f6c..b92b61de05088 100644 --- a/.github/workflows/libclang-abi-tests.yml +++ b/.github/workflows/libclang-abi-tests.yml @@ -84,6 +84,8 @@ jobs: if: github.repository_owner == 'llvm' needs: abi-dump-setup runs-on: ubuntu-24.04 + container: + image: "ghcr.io/llvm/ci-ubuntu-24.04-abi-tests@sha256:f80125c0f767e29b8616210c0fd5cea2cd1f4fb6f2ca86d89f6016b6329b8d7f" #ghcr.io/llvm/ci-ubuntu-24.04-abi-tests:9524b37c503f strategy: matrix: name: @@ -101,17 +103,6 @@ jobs: steps: - name: Install Ninja uses: llvm/actions/install-ninja@42d80571b13f4599bbefbc7189728b64723c7f78 # main - - name: Install abi-compliance-checker - run: | - sudo apt-get update - sudo apt-get install -y abi-dumper autoconf pkg-config - - name: Install universal-ctags - run: | - git clone https://github.com/universal-ctags/ctags.git - cd ctags - ./autogen.sh - ./configure - sudo make install - name: Download source code uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 with: @@ -139,6 +130,8 @@ jobs: abi-compare: if: github.repository_owner == 'llvm' runs-on: ubuntu-24.04 + container: + image: "ghcr.io/llvm/ci-ubuntu-24.04-abi-tests@sha256:f80125c0f767e29b8616210c0fd5cea2cd1f4fb6f2ca86d89f6016b6329b8d7f" #ghcr.io/llvm/ci-ubuntu-24.04-abi-tests:9524b37c503f needs: - abi-dump-setup - abi-dump @@ -154,10 +147,6 @@ jobs: name: build-latest path: build-latest - - name: Install abi-compliance-checker - run: | - sudo apt-get update - sudo apt-get install -y abi-compliance-checker - name: Compare ABI run: | for lib in ${{ needs.abi-dump-setup.outputs.ABI_LIBS }}; do From 2bf92787dfa5e59bfbc62edd397ecff051991ed3 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Tue, 11 Nov 2025 15:07:59 -0800 Subject: [PATCH 63/64] AMDGPU: Start using RegClassByHwMode for wavesize operands (#159884) This eliminates the pseudo registerclasses used to hack the wave register class, which are now replaced with RegClassByHwMode, so most of the diff is from register class ID renumbering. 
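The gist of the change, heavily simplified from the real definitions in the
diff below (SReg_1 actually enumerates all five HwModes, including the
aligned-VGPR and AGPR variants):

    // Before: an unallocatable pseudo class, special-cased in
    // SIRegisterInfo::getRegClass() to resolve to the wave mask class.
    def SReg_1 : SIRegisterClass<"AMDGPU", [i1], 32, ...>;

    // After: TableGen selects the underlying class per HwMode, so wave32
    // subtargets get a 32-bit mask class and wave64 subtargets a 64-bit one.
    def SReg_1 : RegClassByHwMode<
      [DefaultMode_Wave64, DefaultMode_Wave32],
      [SReg_64,            SReg_32]>;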
--- llvm/lib/Target/AMDGPU/AMDGPU.td | 35 +- .../Target/AMDGPU/AMDGPURegisterBankInfo.cpp | 3 - llvm/lib/Target/AMDGPU/AMDGPURegisterBanks.td | 2 +- .../AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp | 20 +- llvm/lib/Target/AMDGPU/SIInstrInfo.td | 7 - llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp | 2 +- llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp | 5 +- llvm/lib/Target/AMDGPU/SIRegisterInfo.td | 95 +++-- llvm/lib/Target/AMDGPU/VOPCInstructions.td | 10 +- llvm/lib/Target/AMDGPU/VOPInstructions.td | 4 +- .../GlobalISel/irtranslator-inline-asm.ll | 32 +- .../regbankcombiner-ignore-copies-crash.mir | 4 +- .../AMDGPU/branch-relax-indirect-branch.mir | 4 +- .../AMDGPU/branch-relax-no-terminators.mir | 4 +- .../coalesce-copy-to-agpr-to-av-registers.mir | 240 ++++++------ .../AMDGPU/coalescer-early-clobber-subreg.mir | 16 +- llvm/test/CodeGen/AMDGPU/dst-sel-hazard.mir | 24 +- llvm/test/CodeGen/AMDGPU/hazards-gfx950.mir | 4 +- ...class-vgpr-mfma-to-av-with-load-source.mir | 12 +- llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll | 24 +- ...local-stack-alloc-add-references.gfx10.mir | 30 +- .../local-stack-alloc-add-references.gfx8.mir | 360 +++++++++--------- .../local-stack-alloc-add-references.gfx9.mir | 180 ++++----- .../machine-scheduler-sink-trivial-remats.mir | 4 +- llvm/test/CodeGen/AMDGPU/mai-hazards.mir | 6 +- ...al-regcopy-and-spill-missed-at-regalloc.ll | 24 +- ...lloc-failure-overlapping-insert-assert.mir | 16 +- .../AMDGPU/rename-independent-subregs.mir | 4 +- .../rewrite-vgpr-mfma-to-agpr-copy-from.mir | 4 +- ...gpr-mfma-to-agpr-subreg-insert-extract.mir | 12 +- ...te-vgpr-mfma-to-agpr-subreg-src2-chain.mir | 36 +- ...ssert-dead-def-subreg-use-other-subreg.mir | 4 +- ...dleMoveUp-subreg-def-across-subreg-def.mir | 16 +- .../CodeGen/AMDGPU/spill-vector-superclass.ll | 4 +- ...ubreg-undef-def-with-other-subreg-defs.mir | 24 +- .../Inputs/amdgpu_isel.ll.expected | 4 +- 36 files changed, 672 insertions(+), 603 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td index 0b61adf409948..b008354cfd462 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -2976,15 +2976,46 @@ def HasSetPrioIncWgInst : Predicate<"Subtarget->hasSetPrioIncWgInst()">, def NeedsAlignedVGPRs : Predicate<"Subtarget->needsAlignedVGPRs()">, AssemblerPredicate<(all_of FeatureRequiresAlignedVGPRs)>; +def NotNeedsAlignedVGPRs : Predicate<"!Subtarget->needsAlignedVGPRs()">, + AssemblerPredicate<(all_of (not FeatureRequiresAlignedVGPRs))>; + +def isWave32 : Predicate<"Subtarget->isWave32()">, + AssemblerPredicate <(any_of FeatureWavefrontSize32, + FeatureAssemblerPermissiveWavesize)>; +def isWave64 : Predicate<"Subtarget->isWave64()">, + AssemblerPredicate <(any_of FeatureWavefrontSize64, + FeatureAssemblerPermissiveWavesize)>; + +def isWave32Strict : Predicate<"Subtarget->isWave32()">, + AssemblerPredicate <(all_of FeatureWavefrontSize32)>; +def isWave64Strict : Predicate<"Subtarget->isWave64()">, + AssemblerPredicate <(all_of FeatureWavefrontSize64)>; + //===----------------------------------------------------------------------===// // HwModes //===----------------------------------------------------------------------===// -// gfx90a-gfx950. Has AGPRs, and also the align2 VGPR/AGPR requirement +defvar DefaultMode_Wave64 = DefaultMode; +defvar DefaultMode_Wave32 = HwMode<[isWave32, NotNeedsAlignedVGPRs]>; + +// gfx90a-gfx950. Has AGPRs, and also the align2 VGPR/AGPR requirement. Implied +// wave64. 
def AVAlign2LoadStoreMode : HwMode<[HasMAIInsts, NeedsAlignedVGPRs]>; // gfx1250, has alignment requirement but no AGPRs. -def AlignedVGPRNoAGPRMode : HwMode<[NotHasMAIInsts, NeedsAlignedVGPRs]>; +def AlignedVGPRNoAGPRMode_Wave32 : HwMode<[NotHasMAIInsts, NeedsAlignedVGPRs, isWave32Strict]>; +def AlignedVGPRNoAGPRMode_Wave64 : HwMode<[NotHasMAIInsts, NeedsAlignedVGPRs, isWave64Strict]>; + +// FIXME: This should be able to only define a separate hwmode that +// only depends on wavesize for just ValueTypes. These use different +// HwMode namespaces. If we don't define the full set of modes used +// for RegClassByHwMode, tablegen crashes for some reason +def WaveSizeVT : ValueTypeByHwMode<[ + DefaultMode_Wave64, + AVAlign2LoadStoreMode, + AlignedVGPRNoAGPRMode_Wave64, + DefaultMode_Wave32, + AlignedVGPRNoAGPRMode_Wave32], [i64, i64, i64, i32, i32]>; // Include AMDGPU TD files diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index dbca5afeca816..90d319f578f44 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -287,9 +287,6 @@ unsigned AMDGPURegisterBankInfo::getBreakDownCost( const RegisterBank & AMDGPURegisterBankInfo::getRegBankFromRegClass(const TargetRegisterClass &RC, LLT Ty) const { - if (&RC == &AMDGPU::SReg_1RegClass) - return AMDGPU::VCCRegBank; - // We promote real scalar booleans to SReg_32. Any SGPR using s1 is really a // VCC-like use. if (TRI->isSGPRClass(&RC)) { diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBanks.td b/llvm/lib/Target/AMDGPU/AMDGPURegisterBanks.td index 1c1a6dac75a17..c37d3096afd3e 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBanks.td +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBanks.td @@ -15,7 +15,7 @@ def VGPRRegBank : RegisterBank<"VGPR", >; // It is helpful to distinguish conditions from ordinary SGPRs. -def VCCRegBank : RegisterBank <"VCC", [SReg_1]>; +def VCCRegBank : RegisterBank<"VCC", [SReg_32, SReg_64]>; def AGPRRegBank : RegisterBank <"AGPR", [AGPR_LO16, AGPR_32, AReg_64, AReg_96, AReg_128, AReg_160, AReg_192, AReg_224, AReg_256, AReg_288, AReg_320, AReg_352, AReg_384, AReg_512, AReg_1024] diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp index 207c1da56ca59..8ef5874d7baf9 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp @@ -795,14 +795,24 @@ void AMDGPUInstPrinter::printRegularOperand(const MCInst *MI, unsigned OpNo, // Intention: print disassembler message when invalid code is decoded, // for example sgpr register used in VReg or VISrc(VReg or imm) operand. const MCOperandInfo &OpInfo = Desc.operands()[OpNo]; - int16_t RCID = MII.getOpRegClassID( - OpInfo, STI.getHwMode(MCSubtargetInfo::HwMode_RegInfo)); - if (RCID != -1) { + if (OpInfo.RegClass != -1) { + int16_t RCID = MII.getOpRegClassID( + OpInfo, STI.getHwMode(MCSubtargetInfo::HwMode_RegInfo)); const MCRegisterClass &RC = MRI.getRegClass(RCID); auto Reg = mc2PseudoReg(Op.getReg()); if (!RC.contains(Reg) && !isInlineValue(Reg)) { - O << "/*Invalid register, operand has \'" << MRI.getRegClassName(&RC) - << "\' register class*/"; + bool IsWaveSizeOp = OpInfo.isLookupRegClassByHwMode() && + (OpInfo.RegClass == AMDGPU::SReg_1 || + OpInfo.RegClass == AMDGPU::SReg_1_XEXEC); + // Suppress this comment for a mismatched wavesize. 
Some users expect to + // be able to assemble and disassemble modules with mixed wavesizes, but + // we do not know the subtarget in different functions in MC. + // + // TODO: Should probably print it anyway, maybe a more specific version. + if (!IsWaveSizeOp) { + O << "/*Invalid register, operand has \'" << MRI.getRegClassName(&RC) + << "\' register class*/"; + } } } } else if (Op.isImm()) { diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td index 0bde5d3fd2f26..42e73ec070c15 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td @@ -6,13 +6,6 @@ // //===----------------------------------------------------------------------===// -def isWave32 : Predicate<"Subtarget->isWave32()">, - AssemblerPredicate <(any_of FeatureWavefrontSize32, - FeatureAssemblerPermissiveWavesize)>; -def isWave64 : Predicate<"Subtarget->isWave64()">, - AssemblerPredicate <(any_of FeatureWavefrontSize64, - FeatureAssemblerPermissiveWavesize)>; - class AMDGPUMnemonicAlias : MnemonicAlias, PredicateControl; diff --git a/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp b/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp index 96131bd591a17..9b710013a09ce 100644 --- a/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp +++ b/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp @@ -75,7 +75,7 @@ Vreg1LoweringHelper::Vreg1LoweringHelper(MachineFunction *MF, bool Vreg1LoweringHelper::cleanConstrainRegs(bool Changed) { assert(Changed || ConstrainRegs.empty()); for (Register Reg : ConstrainRegs) - MRI->constrainRegClass(Reg, &AMDGPU::SReg_1_XEXECRegClass); + MRI->constrainRegClass(Reg, TII->getRegisterInfo().getWaveMaskRegClass()); ConstrainRegs.clear(); return Changed; diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp index ff3491d193460..8fba74831811f 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -3908,13 +3908,10 @@ const TargetRegisterClass *SIRegisterInfo::getVGPR64Class() const { : &AMDGPU::VReg_64RegClass; } +// FIXME: This should be deleted const TargetRegisterClass * SIRegisterInfo::getRegClass(unsigned RCID) const { switch ((int)RCID) { - case AMDGPU::SReg_1RegClassID: - return getBoolRC(); - case AMDGPU::SReg_1_XEXECRegClassID: - return getWaveMaskRegClass(); case -1: return nullptr; default: diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td index fc8f46a0d2b93..abe12c17ae76c 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td @@ -896,20 +896,6 @@ def SReg_64_Encodable : SIRegisterClass<"AMDGPU", [v2i32, i64, v2f32, f64, i1, v let Size = 64; } -def SReg_1_XEXEC : SIRegisterClass<"AMDGPU", [i1], 32, - (add SReg_64_XEXEC, SReg_32_XEXEC)> { - let CopyCost = 1; - let isAllocatable = 0; - let HasSGPR = 1; -} - -def SReg_1 : SIRegisterClass<"AMDGPU", [i1], 32, - (add SReg_1_XEXEC, EXEC, EXEC_LO, EXEC_HI)> { - let CopyCost = 1; - let isAllocatable = 0; - let HasSGPR = 1; -} - multiclass SRegClass regTypes, SIRegisterTuples regList, @@ -1205,6 +1191,34 @@ defm AV_512 : AVRegClass<16, VReg_512.RegTypes, (add VGPR_512), (add AGPR_512)>; defm AV_1024 : AVRegClass<32, VReg_1024.RegTypes, (add VGPR_1024), (add AGPR_1024)>; } +def SReg_1_XEXEC : SIRegisterClassLike<0, false, false, true>, + RegClassByHwMode< + [DefaultMode_Wave64, + AlignedVGPRNoAGPRMode_Wave64, + AVAlign2LoadStoreMode, + DefaultMode_Wave32, + AlignedVGPRNoAGPRMode_Wave32], + [SReg_64_XEXEC, + SReg_64_XEXEC, + 
SReg_64_XEXEC, + SReg_32_XM0_XEXEC, // FIXME: Why do the wave32 cases exclude m0? + SReg_32_XM0_XEXEC] +>; + +def SReg_1 : SIRegisterClassLike<0, false, false, true>, + RegClassByHwMode< + [DefaultMode_Wave64, + AlignedVGPRNoAGPRMode_Wave64, + AVAlign2LoadStoreMode, + DefaultMode_Wave32, + AlignedVGPRNoAGPRMode_Wave32], + [SReg_64, + SReg_64, + SReg_64, + SReg_32, + SReg_32] +>; + //===----------------------------------------------------------------------===// // // AlignTarget classes. Artifical classes to swap between @@ -1212,17 +1226,36 @@ defm AV_1024 : AVRegClass<32, VReg_1024.RegTypes, (add VGPR_1024), (add AGPR_102 // //===----------------------------------------------------------------------===// +// We have 3 orthogonal properties to consider. Unfortunately we need +// to define the cross product of these states, minus unused +// combinations. + def AV_LdSt_32_Target : RegClassByHwMode< - [DefaultMode, AVAlign2LoadStoreMode, AlignedVGPRNoAGPRMode], - [VGPR_32, AV_32, VGPR_32]>, SIRegisterClassLike<32, true, true> { + [DefaultMode_Wave64, + DefaultMode_Wave32, + AVAlign2LoadStoreMode, + AlignedVGPRNoAGPRMode_Wave64, + AlignedVGPRNoAGPRMode_Wave32], + [VGPR_32, + VGPR_32, + AV_32, + VGPR_32, + VGPR_32]>, + SIRegisterClassLike<32, true, true> { let DecoderMethod = "decodeAVLdSt"; } foreach RegSize = [ 64, 96, 128, 160, 192, 224, 256, 288, 320, 352, 384, 512, 1024 ] in { def VReg_#RegSize#_AlignTarget : SIRegisterClassLike, RegClassByHwMode< - [DefaultMode, AVAlign2LoadStoreMode, AlignedVGPRNoAGPRMode], + [DefaultMode_Wave64, + DefaultMode_Wave32, + AVAlign2LoadStoreMode, + AlignedVGPRNoAGPRMode_Wave64, + AlignedVGPRNoAGPRMode_Wave32], [!cast("VReg_"#RegSize), + !cast("VReg_"#RegSize), + !cast("VReg_"#RegSize#_Align2), !cast("VReg_"#RegSize#_Align2), !cast("VReg_"#RegSize#_Align2)]> { let DecoderMethod = "DecodeVReg_"#RegSize#"RegisterClass"; @@ -1230,45 +1263,59 @@ foreach RegSize = [ 64, 96, 128, 160, 192, 224, 256, 288, 320, 352, 384, 512, 10 def AReg_#RegSize#_AlignTarget : SIRegisterClassLike, RegClassByHwMode< - [DefaultMode, AVAlign2LoadStoreMode, /*Unused combination*/], + [DefaultMode_Wave64, /*unused combination*/ AVAlign2LoadStoreMode, /*Unused combination*/ /*Unused combination*/], [!cast("AReg_"#RegSize), + /*unused combination*/ !cast("AReg_"#RegSize#_Align2) + /*Unused combination*/ /*Unused combination*/]> { let DecoderMethod = "DecodeAReg_"#RegSize#"RegisterClass"; } def AV_#RegSize#_AlignTarget : SIRegisterClassLike, RegClassByHwMode< - [DefaultMode, AVAlign2LoadStoreMode, AlignedVGPRNoAGPRMode], + [DefaultMode_Wave32, + DefaultMode_Wave64, + AVAlign2LoadStoreMode, + AlignedVGPRNoAGPRMode_Wave64, + AlignedVGPRNoAGPRMode_Wave32], [!cast("AV_"#RegSize), + !cast("AV_"#RegSize), !cast("AV_"#RegSize#_Align2), + !cast("VReg_"#RegSize#_Align2), !cast("VReg_"#RegSize#_Align2)]> { let DecoderMethod = "DecodeAV_"#RegSize#"RegisterClass"; } def AV_LdSt_#RegSize#_AlignTarget : SIRegisterClassLike, RegClassByHwMode< - [DefaultMode, AVAlign2LoadStoreMode, AlignedVGPRNoAGPRMode], + [DefaultMode_Wave64, DefaultMode_Wave32, AVAlign2LoadStoreMode, AlignedVGPRNoAGPRMode_Wave64, AlignedVGPRNoAGPRMode_Wave32], [!cast("VReg_"#RegSize), + !cast("VReg_"#RegSize), !cast("AV_"#RegSize#_Align2), + !cast("VReg_"#RegSize#_Align2), !cast("VReg_"#RegSize#_Align2)]> { let DecoderMethod = "decodeAVLdSt"; } def AV_LdSt_#RegSize#_Align2 : SIRegisterClassLike, RegClassByHwMode< - [DefaultMode, AVAlign2LoadStoreMode, AlignedVGPRNoAGPRMode], + [DefaultMode_Wave64, DefaultMode_Wave32, 
AVAlign2LoadStoreMode, AlignedVGPRNoAGPRMode_Wave64, AlignedVGPRNoAGPRMode_Wave32], [!cast("VReg_"#RegSize#_Align2), + !cast("VReg_"#RegSize#_Align2), !cast("AV_"#RegSize#_Align2), + !cast("VReg_"#RegSize#_Align2), !cast("VReg_"#RegSize#_Align2)]> { let DecoderMethod = "decodeAVLdSt"; } def AV_LdSt_#RegSize#_Align1 : SIRegisterClassLike, RegClassByHwMode< - [DefaultMode, AVAlign2LoadStoreMode, AlignedVGPRNoAGPRMode], + [DefaultMode_Wave64, DefaultMode_Wave32, AVAlign2LoadStoreMode, AlignedVGPRNoAGPRMode_Wave64, AlignedVGPRNoAGPRMode_Wave32], [!cast("VReg_"#RegSize), + !cast("VReg_"#RegSize), !cast("AV_"#RegSize), + !cast("VReg_"#RegSize), !cast("VReg_"#RegSize)]> { let DecoderMethod = "decodeAVLdSt"; } @@ -1276,8 +1323,8 @@ foreach RegSize = [ 64, 96, 128, 160, 192, 224, 256, 288, 320, 352, 384, 512, 10 def VS_64_AlignTarget : SIRegisterClassLike<64, true, false, true>, RegClassByHwMode< - [DefaultMode, AVAlign2LoadStoreMode, AlignedVGPRNoAGPRMode], - [VS_64, VS_64_Align2, VS_64_Align2]> { + [DefaultMode_Wave64, DefaultMode_Wave32, AVAlign2LoadStoreMode, AlignedVGPRNoAGPRMode_Wave64, AlignedVGPRNoAGPRMode_Wave32], + [VS_64, VS_64, VS_64_Align2, VS_64_Align2, VS_64_Align2]> { let DecoderMethod = "decodeSrcRegOrImm9"; } diff --git a/llvm/lib/Target/AMDGPU/VOPCInstructions.td b/llvm/lib/Target/AMDGPU/VOPCInstructions.td index 2730ec52294e9..a829b807f33e8 100644 --- a/llvm/lib/Target/AMDGPU/VOPCInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOPCInstructions.td @@ -1233,18 +1233,12 @@ defm V_CMPX_CLASS_F16 : VOPCX_CLASS_F16 <"v_cmpx_class_f16">; // We need to use COPY_TO_REGCLASS to w/a the problem when ReplaceAllUsesWith() // complaints it cannot replace i1 <-> i64/i32 if node was not morphed in place. multiclass ICMP_Pattern { - let WaveSizePredicate = isWave64 in def : GCNPat < - (i64 (AMDGPUsetcc vt:$src0, vt:$src1, cond)), - (i64 (COPY_TO_REGCLASS dstInst, SReg_64)) + (WaveSizeVT (AMDGPUsetcc vt:$src0, vt:$src1, cond)), + dstInst >; let WaveSizePredicate = isWave32 in { - def : GCNPat < - (i32 (AMDGPUsetcc vt:$src0, vt:$src1, cond)), - (i32 (COPY_TO_REGCLASS dstInst, SReg_32)) - >; - // Support codegen of i64 setcc in wave32 mode. 
def : GCNPat < (i64 (AMDGPUsetcc vt:$src0, vt:$src1, cond)), diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td index 6df91a1d438b0..ea3edb8ca6662 100644 --- a/llvm/lib/Target/AMDGPU/VOPInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td @@ -2208,12 +2208,12 @@ include "VOP3PInstructions.td" include "VOPDInstructions.td" class ClassPat : GCNPat < - (is_fpclass (vt (VOP3ModsNonCanonicalizing vt:$src0, i32:$src0_mods)), (i32 timm:$mask)), + (i1 (is_fpclass (vt (VOP3ModsNonCanonicalizing vt:$src0, i32:$src0_mods)), (i32 timm:$mask))), (inst i32:$src0_mods, vt:$src0, (V_MOV_B32_e32 timm:$mask)) >; class ClassPat_t16 : GCNPat < - (is_fpclass (vt (VOP3ModsNonCanonicalizing vt:$src0, i32:$src0_mods)), (i32 timm:$mask)), + (i1 (is_fpclass (vt (VOP3ModsNonCanonicalizing vt:$src0, i32:$src0_mods)), (i32 timm:$mask))), (inst i32:$src0_mods, vt:$src0, SRCMODS.NONE, (V_MOV_B32_e32 timm:$mask)) >; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-inline-asm.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-inline-asm.ll index e5cd0710359ac..70ff92f8eda92 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-inline-asm.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-inline-asm.ll @@ -66,7 +66,7 @@ define amdgpu_kernel void @asm_simple_agpr_clobber() { define i32 @asm_vgpr_early_clobber() { ; CHECK-LABEL: name: asm_vgpr_early_clobber ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: INLINEASM &"v_mov_b32 $0, 7; v_mov_b32 $1, 7", 1 /* sideeffect attdialect */, 1835019 /* regdef-ec:VGPR_32 */, def early-clobber %8, 1835019 /* regdef-ec:VGPR_32 */, def early-clobber %9, !1 + ; CHECK-NEXT: INLINEASM &"v_mov_b32 $0, 7; v_mov_b32 $1, 7", 1 /* sideeffect attdialect */, 1245195 /* regdef-ec:VGPR_32 */, def early-clobber %8, 1245195 /* regdef-ec:VGPR_32 */, def early-clobber %9, !1 ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY %8 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY %9 ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[COPY]], [[COPY1]] @@ -94,7 +94,7 @@ entry: define i32 @test_single_vgpr_output() nounwind { ; CHECK-LABEL: name: test_single_vgpr_output ; CHECK: bb.1.entry: - ; CHECK-NEXT: INLINEASM &"v_mov_b32 $0, 7", 0 /* attdialect */, 1835018 /* regdef:VGPR_32 */, def %8 + ; CHECK-NEXT: INLINEASM &"v_mov_b32 $0, 7", 0 /* attdialect */, 1245194 /* regdef:VGPR_32 */, def %8 ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY %8 ; CHECK-NEXT: $vgpr0 = COPY [[COPY]](s32) ; CHECK-NEXT: SI_RETURN implicit $vgpr0 @@ -106,7 +106,7 @@ entry: define i32 @test_single_sgpr_output_s32() nounwind { ; CHECK-LABEL: name: test_single_sgpr_output_s32 ; CHECK: bb.1.entry: - ; CHECK-NEXT: INLINEASM &"s_mov_b32 $0, 7", 0 /* attdialect */, 2424842 /* regdef:SReg_32 */, def %8 + ; CHECK-NEXT: INLINEASM &"s_mov_b32 $0, 7", 0 /* attdialect */, 1835018 /* regdef:SReg_32 */, def %8 ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY %8 ; CHECK-NEXT: $vgpr0 = COPY [[COPY]](s32) ; CHECK-NEXT: SI_RETURN implicit $vgpr0 @@ -119,7 +119,7 @@ entry: define float @test_multiple_register_outputs_same() #0 { ; CHECK-LABEL: name: test_multiple_register_outputs_same ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: INLINEASM &"v_mov_b32 $0, 0; v_mov_b32 $1, 1", 0 /* attdialect */, 1835018 /* regdef:VGPR_32 */, def %8, 1835018 /* regdef:VGPR_32 */, def %9 + ; CHECK-NEXT: INLINEASM &"v_mov_b32 $0, 0; v_mov_b32 $1, 1", 0 /* attdialect */, 1245194 /* regdef:VGPR_32 */, def %8, 1245194 /* regdef:VGPR_32 */, def %9 ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY %8 ; CHECK-NEXT: 
[[COPY1:%[0-9]+]]:_(s32) = COPY %9 ; CHECK-NEXT: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[COPY]], [[COPY1]] @@ -136,7 +136,7 @@ define float @test_multiple_register_outputs_same() #0 { define double @test_multiple_register_outputs_mixed() #0 { ; CHECK-LABEL: name: test_multiple_register_outputs_mixed ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: INLINEASM &"v_mov_b32 $0, 0; v_add_f64 $1, 0, 0", 0 /* attdialect */, 1835018 /* regdef:VGPR_32 */, def %8, 3407882 /* regdef:VReg_64 */, def %9 + ; CHECK-NEXT: INLINEASM &"v_mov_b32 $0, 0; v_add_f64 $1, 0, 0", 0 /* attdialect */, 1245194 /* regdef:VGPR_32 */, def %8, 2818058 /* regdef:VReg_64 */, def %9 ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY %8 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY %9 ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](s64) @@ -171,7 +171,7 @@ define amdgpu_kernel void @test_input_vgpr_imm() { ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 42 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[C]](s32) - ; CHECK-NEXT: INLINEASM &"v_mov_b32 v0, $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"v_mov_b32 v0, $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[COPY1]] ; CHECK-NEXT: S_ENDPGM 0 call void asm sideeffect "v_mov_b32 v0, $0", "v"(i32 42) ret void @@ -185,7 +185,7 @@ define amdgpu_kernel void @test_input_sgpr_imm() { ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 42 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[C]](s32) - ; CHECK-NEXT: INLINEASM &"s_mov_b32 s0, $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"s_mov_b32 s0, $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[COPY1]] ; CHECK-NEXT: S_ENDPGM 0 call void asm sideeffect "s_mov_b32 s0, $0", "s"(i32 42) ret void @@ -212,7 +212,7 @@ define float @test_input_vgpr(i32 %src) nounwind { ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]](s32) - ; CHECK-NEXT: INLINEASM &"v_add_f32 $0, 1.0, $1", 0 /* attdialect */, 1835018 /* regdef:VGPR_32 */, def %9, 1835017 /* reguse:VGPR_32 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"v_add_f32 $0, 1.0, $1", 0 /* attdialect */, 1245194 /* regdef:VGPR_32 */, def %9, 1245193 /* reguse:VGPR_32 */, [[COPY1]] ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY %9 ; CHECK-NEXT: $vgpr0 = COPY [[COPY2]](s32) ; CHECK-NEXT: SI_RETURN implicit $vgpr0 @@ -227,7 +227,7 @@ define i32 @test_memory_constraint(ptr addrspace(3) %a) nounwind { ; CHECK-NEXT: liveins: $vgpr0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 - ; CHECK-NEXT: INLINEASM &"ds_read_b32 $0, $1", 8 /* mayload attdialect */, 1835018 /* regdef:VGPR_32 */, def %9, 262158 /* mem:m */, [[COPY]](p3) + ; CHECK-NEXT: INLINEASM &"ds_read_b32 $0, $1", 8 /* mayload attdialect */, 1245194 /* regdef:VGPR_32 */, def %9, 262158 /* mem:m */, [[COPY]](p3) ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY %9 ; CHECK-NEXT: $vgpr0 = COPY [[COPY1]](s32) ; CHECK-NEXT: SI_RETURN implicit $vgpr0 @@ -244,7 +244,7 @@ define i32 @test_vgpr_matching_constraint(i32 %a) nounwind { ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY]], [[C]] ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[AND]](s32) - ; CHECK-NEXT: INLINEASM &";", 1 /* sideeffect attdialect */, 1835018 
/* regdef:VGPR_32 */, def %11, 2147483657 /* reguse tiedto:$0 */, [[COPY1]](tied-def 3) + ; CHECK-NEXT: INLINEASM &";", 1 /* sideeffect attdialect */, 1245194 /* regdef:VGPR_32 */, def %11, 2147483657 /* reguse tiedto:$0 */, [[COPY1]](tied-def 3) ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY %11 ; CHECK-NEXT: $vgpr0 = COPY [[COPY2]](s32) ; CHECK-NEXT: SI_RETURN implicit $vgpr0 @@ -256,13 +256,13 @@ define i32 @test_vgpr_matching_constraint(i32 %a) nounwind { define i32 @test_sgpr_matching_constraint() nounwind { ; CHECK-LABEL: name: test_sgpr_matching_constraint ; CHECK: bb.1.entry: - ; CHECK-NEXT: INLINEASM &"s_mov_b32 $0, 7", 0 /* attdialect */, 2424842 /* regdef:SReg_32 */, def %8 + ; CHECK-NEXT: INLINEASM &"s_mov_b32 $0, 7", 0 /* attdialect */, 1835018 /* regdef:SReg_32 */, def %8 ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY %8 - ; CHECK-NEXT: INLINEASM &"s_mov_b32 $0, 8", 0 /* attdialect */, 2424842 /* regdef:SReg_32 */, def %10 + ; CHECK-NEXT: INLINEASM &"s_mov_b32 $0, 8", 0 /* attdialect */, 1835018 /* regdef:SReg_32 */, def %10 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY %10 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[COPY]](s32) ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[COPY1]](s32) - ; CHECK-NEXT: INLINEASM &"s_add_u32 $0, $1, $2", 0 /* attdialect */, 2424842 /* regdef:SReg_32 */, def %12, 2424841 /* reguse:SReg_32 */, [[COPY2]], 2147483657 /* reguse tiedto:$0 */, [[COPY3]](tied-def 3) + ; CHECK-NEXT: INLINEASM &"s_add_u32 $0, $1, $2", 0 /* attdialect */, 1835018 /* regdef:SReg_32 */, def %12, 1835017 /* reguse:SReg_32 */, [[COPY2]], 2147483657 /* reguse tiedto:$0 */, [[COPY3]](tied-def 3) ; CHECK-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY %12 ; CHECK-NEXT: $vgpr0 = COPY [[COPY4]](s32) ; CHECK-NEXT: SI_RETURN implicit $vgpr0 @@ -285,7 +285,7 @@ define void @test_many_matching_constraints(i32 %a, i32 %b, i32 %c) nounwind { ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY2]](s32) ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]](s32) ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[COPY1]](s32) - ; CHECK-NEXT: INLINEASM &"; ", 1 /* sideeffect attdialect */, 1835018 /* regdef:VGPR_32 */, def %11, 1835018 /* regdef:VGPR_32 */, def %12, 1835018 /* regdef:VGPR_32 */, def %13, 2147483657 /* reguse tiedto:$0 */, [[COPY3]](tied-def 3), 2147614729 /* reguse tiedto:$2 */, [[COPY4]](tied-def 7), 2147549193 /* reguse tiedto:$1 */, [[COPY5]](tied-def 5) + ; CHECK-NEXT: INLINEASM &"; ", 1 /* sideeffect attdialect */, 1245194 /* regdef:VGPR_32 */, def %11, 1245194 /* regdef:VGPR_32 */, def %12, 1245194 /* regdef:VGPR_32 */, def %13, 2147483657 /* reguse tiedto:$0 */, [[COPY3]](tied-def 3), 2147614729 /* reguse tiedto:$2 */, [[COPY4]](tied-def 7), 2147549193 /* reguse tiedto:$1 */, [[COPY5]](tied-def 5) ; CHECK-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY %11 ; CHECK-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY %12 ; CHECK-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY %13 @@ -306,10 +306,10 @@ define void @test_many_matching_constraints(i32 %a, i32 %b, i32 %c) nounwind { define i32 @test_sgpr_to_vgpr_move_matching_constraint() nounwind { ; CHECK-LABEL: name: test_sgpr_to_vgpr_move_matching_constraint ; CHECK: bb.1.entry: - ; CHECK-NEXT: INLINEASM &"s_mov_b32 $0, 7", 0 /* attdialect */, 2424842 /* regdef:SReg_32 */, def %8 + ; CHECK-NEXT: INLINEASM &"s_mov_b32 $0, 7", 0 /* attdialect */, 1835018 /* regdef:SReg_32 */, def %8 ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY %8 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]](s32) - ; CHECK-NEXT: INLINEASM &"v_mov_b32 $0, $1", 0 /* attdialect */, 
1835018 /* regdef:VGPR_32 */, def %10, 2147483657 /* reguse tiedto:$0 */, [[COPY1]](tied-def 3) + ; CHECK-NEXT: INLINEASM &"v_mov_b32 $0, $1", 0 /* attdialect */, 1245194 /* regdef:VGPR_32 */, def %10, 2147483657 /* reguse tiedto:$0 */, [[COPY1]](tied-def 3) ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY %10 ; CHECK-NEXT: $vgpr0 = COPY [[COPY2]](s32) ; CHECK-NEXT: SI_RETURN implicit $vgpr0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankcombiner-ignore-copies-crash.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankcombiner-ignore-copies-crash.mir index 137488f24a331..7ca3869b535e4 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankcombiner-ignore-copies-crash.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankcombiner-ignore-copies-crash.mir @@ -24,7 +24,7 @@ body: | ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) ; CHECK-NEXT: [[FMUL:%[0-9]+]]:vgpr(s32) = G_FMUL [[COPY]], [[COPY1]] ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_FCONSTANT float 1.000000e+00 - ; CHECK-NEXT: INLINEASM &"v_mov_b32 $0, 0", 0 /* attdialect */, 1835018 /* regdef:VGPR_32 */, def %5(s32) + ; CHECK-NEXT: INLINEASM &"v_mov_b32 $0, 0", 0 /* attdialect */, 1245194 /* regdef:VGPR_32 */, def %5(s32) ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C1]](s32) ; CHECK-NEXT: [[AMDGPU_FMED3_:%[0-9]+]]:vgpr(s32) = nnan G_AMDGPU_FMED3 [[FMUL]], %5, [[COPY2]] ; CHECK-NEXT: $vgpr0 = COPY [[AMDGPU_FMED3_]](s32) @@ -33,7 +33,7 @@ body: | %2:vgpr(s32) = COPY %1(s32) %3:vgpr(s32) = G_FMUL %0, %2 %4:sgpr(s32) = G_FCONSTANT float 1.000000e+00 - INLINEASM &"v_mov_b32 $0, 0", 0 /* attdialect */, 1835018 /* regdef:VGPR_32 */, def %5:vgpr_32 + INLINEASM &"v_mov_b32 $0, 0", 0 /* attdialect */, 1245194 /* regdef:VGPR_32 */, def %5:vgpr_32 %6:vgpr(s32) = COPY %4(s32) %7:vgpr(s32) = nnan G_AMDGPU_FMED3 %3(s32), %5(s32), %6(s32) $vgpr0 = COPY %7(s32) diff --git a/llvm/test/CodeGen/AMDGPU/branch-relax-indirect-branch.mir b/llvm/test/CodeGen/AMDGPU/branch-relax-indirect-branch.mir index 8a39d9c517b50..34c0159dd3ddb 100644 --- a/llvm/test/CodeGen/AMDGPU/branch-relax-indirect-branch.mir +++ b/llvm/test/CodeGen/AMDGPU/branch-relax-indirect-branch.mir @@ -68,7 +68,7 @@ body: | ; CHECK-NEXT: successors: %bb.3(0x04000000), %bb.7(0x7c000000) ; CHECK-NEXT: liveins: $vcc_hi, $vcc_lo, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $sgpr14, $sgpr15, $sgpr16, $sgpr17, $sgpr18, $sgpr19, $sgpr20, $sgpr21, $sgpr22, $sgpr23, $sgpr24, $sgpr25, $sgpr26, $sgpr27, $sgpr28, $sgpr29, $sgpr30, $sgpr31, $sgpr34, $sgpr35, $sgpr36, $sgpr37, $sgpr38, $sgpr39, $sgpr40, $sgpr41, $sgpr42, $sgpr43, $sgpr44, $sgpr45, $sgpr46, $sgpr47, $sgpr48, $sgpr49, $sgpr50, $sgpr51, $sgpr52, $sgpr53, $sgpr54, $sgpr55, $sgpr56, $sgpr57, $sgpr58, $sgpr59, $sgpr60, $sgpr61, $sgpr62, $sgpr63, $sgpr64, $sgpr65, $sgpr66, $sgpr67, $sgpr68, $sgpr69, $sgpr70, $sgpr71, $sgpr72, $sgpr73, $sgpr74, $sgpr75, $sgpr76, $sgpr77, $sgpr78, $sgpr79, $sgpr80, $sgpr81, $sgpr82, $sgpr83, $sgpr84, $sgpr85, $sgpr86, $sgpr87, $sgpr88, $sgpr89, $sgpr90, $sgpr91, $sgpr92, $sgpr93, $sgpr94, $sgpr95, $sgpr96, $sgpr97, $sgpr98, $sgpr99, $sgpr100, $sgpr101, $vgpr0, $vgpr1 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: INLINEASM &"v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64", 1 /* sideeffect attdialect */, 2424842 /* regdef:SReg_32 */, def renamable $sgpr4 + ; CHECK-NEXT: INLINEASM &"v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64", 1 /* sideeffect attdialect */, 1835018 
/* regdef:SReg_32 */, def renamable $sgpr4 ; CHECK-NEXT: S_CMP_LG_U32 killed renamable $sgpr4, 0, implicit-def $scc ; CHECK-NEXT: S_CBRANCH_SCC0 %bb.3, implicit killed $scc ; CHECK-NEXT: {{ $}} @@ -149,7 +149,7 @@ body: | successors: %bb.3(0x04000000), %bb.2(0x7c000000) liveins: $vcc_hi, $vcc_lo, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $sgpr14, $sgpr15, $sgpr16, $sgpr17, $sgpr18, $sgpr19, $sgpr20, $sgpr21, $sgpr22, $sgpr23, $sgpr24, $sgpr25, $sgpr26, $sgpr27, $sgpr28, $sgpr29, $sgpr30, $sgpr31, $sgpr34, $sgpr35, $sgpr36, $sgpr37, $sgpr38, $sgpr39, $sgpr40, $sgpr41, $sgpr42, $sgpr43, $sgpr44, $sgpr45, $sgpr46, $sgpr47, $sgpr48, $sgpr49, $sgpr50, $sgpr51, $sgpr52, $sgpr53, $sgpr54, $sgpr55, $sgpr56, $sgpr57, $sgpr58, $sgpr59, $sgpr60, $sgpr61, $sgpr62, $sgpr63, $sgpr64, $sgpr65, $sgpr66, $sgpr67, $sgpr68, $sgpr69, $sgpr70, $sgpr71, $sgpr72, $sgpr73, $sgpr74, $sgpr75, $sgpr76, $sgpr77, $sgpr78, $sgpr79, $sgpr80, $sgpr81, $sgpr82, $sgpr83, $sgpr84, $sgpr85, $sgpr86, $sgpr87, $sgpr88, $sgpr89, $sgpr90, $sgpr91, $sgpr92, $sgpr93, $sgpr94, $sgpr95, $sgpr96, $sgpr97, $sgpr98, $sgpr99, $sgpr100, $sgpr101, $vgpr0, $vgpr1 - INLINEASM &"v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64", 1 /* sideeffect attdialect */, 2424842 /* regdef:SReg_32 */, def renamable $sgpr4 + INLINEASM &"v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64", 1 /* sideeffect attdialect */, 1835018 /* regdef:SReg_32 */, def renamable $sgpr4 S_CMP_LG_U32 killed renamable $sgpr4, 0, implicit-def $scc S_CBRANCH_SCC1 %bb.2, implicit killed $scc diff --git a/llvm/test/CodeGen/AMDGPU/branch-relax-no-terminators.mir b/llvm/test/CodeGen/AMDGPU/branch-relax-no-terminators.mir index a2f02052cbf36..4cf92b0127131 100644 --- a/llvm/test/CodeGen/AMDGPU/branch-relax-no-terminators.mir +++ b/llvm/test/CodeGen/AMDGPU/branch-relax-no-terminators.mir @@ -69,7 +69,7 @@ body: | ; CHECK-NEXT: successors: %bb.3(0x04000000), %bb.7(0x7c000000) ; CHECK-NEXT: liveins: $vcc_hi, $vcc_lo, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $sgpr14, $sgpr15, $sgpr16, $sgpr17, $sgpr18, $sgpr19, $sgpr20, $sgpr21, $sgpr22, $sgpr23, $sgpr24, $sgpr25, $sgpr26, $sgpr27, $sgpr28, $sgpr29, $sgpr30, $sgpr31, $sgpr34, $sgpr35, $sgpr36, $sgpr37, $sgpr38, $sgpr39, $sgpr40, $sgpr41, $sgpr42, $sgpr43, $sgpr44, $sgpr45, $sgpr46, $sgpr47, $sgpr48, $sgpr49, $sgpr50, $sgpr51, $sgpr52, $sgpr53, $sgpr54, $sgpr55, $sgpr56, $sgpr57, $sgpr58, $sgpr59, $sgpr60, $sgpr61, $sgpr62, $sgpr63, $sgpr64, $sgpr65, $sgpr66, $sgpr67, $sgpr68, $sgpr69, $sgpr70, $sgpr71, $sgpr72, $sgpr73, $sgpr74, $sgpr75, $sgpr76, $sgpr77, $sgpr78, $sgpr79, $sgpr80, $sgpr81, $sgpr82, $sgpr83, $sgpr84, $sgpr85, $sgpr86, $sgpr87, $sgpr88, $sgpr89, $sgpr90, $sgpr91, $sgpr92, $sgpr93, $sgpr94, $sgpr95, $sgpr96, $sgpr97, $sgpr98, $sgpr99, $sgpr100, $sgpr101, $vgpr0, $vgpr1 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: INLINEASM &"v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64", 1 /* sideeffect attdialect */, 2424842 /* regdef:SReg_32 */, def renamable $sgpr4 + ; CHECK-NEXT: INLINEASM &"v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64", 1 /* sideeffect attdialect */, 1835018 /* regdef:SReg_32 */, def renamable $sgpr4 ; CHECK-NEXT: S_CMP_LG_U32 killed renamable $sgpr4, 0, implicit-def $scc ; CHECK-NEXT: S_CBRANCH_SCC0 %bb.3, implicit killed $scc ; CHECK-NEXT: {{ 
$}} @@ -151,7 +151,7 @@ body: | successors: %bb.3(0x04000000), %bb.2(0x7c000000) liveins: $vcc_hi, $vcc_lo, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $sgpr14, $sgpr15, $sgpr16, $sgpr17, $sgpr18, $sgpr19, $sgpr20, $sgpr21, $sgpr22, $sgpr23, $sgpr24, $sgpr25, $sgpr26, $sgpr27, $sgpr28, $sgpr29, $sgpr30, $sgpr31, $sgpr34, $sgpr35, $sgpr36, $sgpr37, $sgpr38, $sgpr39, $sgpr40, $sgpr41, $sgpr42, $sgpr43, $sgpr44, $sgpr45, $sgpr46, $sgpr47, $sgpr48, $sgpr49, $sgpr50, $sgpr51, $sgpr52, $sgpr53, $sgpr54, $sgpr55, $sgpr56, $sgpr57, $sgpr58, $sgpr59, $sgpr60, $sgpr61, $sgpr62, $sgpr63, $sgpr64, $sgpr65, $sgpr66, $sgpr67, $sgpr68, $sgpr69, $sgpr70, $sgpr71, $sgpr72, $sgpr73, $sgpr74, $sgpr75, $sgpr76, $sgpr77, $sgpr78, $sgpr79, $sgpr80, $sgpr81, $sgpr82, $sgpr83, $sgpr84, $sgpr85, $sgpr86, $sgpr87, $sgpr88, $sgpr89, $sgpr90, $sgpr91, $sgpr92, $sgpr93, $sgpr94, $sgpr95, $sgpr96, $sgpr97, $sgpr98, $sgpr99, $sgpr100, $sgpr101, $vgpr0, $vgpr1 - INLINEASM &"v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64", 1 /* sideeffect attdialect */, 2424842 /* regdef:SReg_32 */, def renamable $sgpr4 + INLINEASM &"v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64", 1 /* sideeffect attdialect */, 1835018 /* regdef:SReg_32 */, def renamable $sgpr4 S_CMP_LG_U32 killed renamable $sgpr4, 0, implicit-def $scc S_CBRANCH_SCC1 %bb.2, implicit killed $scc diff --git a/llvm/test/CodeGen/AMDGPU/coalesce-copy-to-agpr-to-av-registers.mir b/llvm/test/CodeGen/AMDGPU/coalesce-copy-to-agpr-to-av-registers.mir index 03f1018c40b21..9e1444d9213e7 100644 --- a/llvm/test/CodeGen/AMDGPU/coalesce-copy-to-agpr-to-av-registers.mir +++ b/llvm/test/CodeGen/AMDGPU/coalesce-copy-to-agpr-to-av-registers.mir @@ -20,13 +20,13 @@ body: | ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0:areg_64 = COPY [[COPY]] ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub1:areg_64 = COPY [[COPY1]] - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64 */, [[COPY2]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3407881 /* reguse:AReg_64 */, [[COPY2]] ; CHECK-NEXT: SI_RETURN %0:vgpr_32 = COPY $vgpr0 %1:vgpr_32 = COPY $vgpr1 undef %2.sub0:areg_64 = COPY %0 %2.sub1:areg_64 = COPY %1 - INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64 */, killed %2 + INLINEASM &"; use $0", 0 /* attdialect */, 3407881 /* reguse:AReg_64 */, killed %2 SI_RETURN ... @@ -45,13 +45,13 @@ body: | ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0:areg_64_align2 = COPY [[COPY]] ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub1:areg_64_align2 = COPY [[COPY1]] - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_64_Align2 */, [[COPY2]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64_Align2 */, [[COPY2]] ; CHECK-NEXT: SI_RETURN %0:vgpr_32 = COPY $vgpr0 %1:vgpr_32 = COPY $vgpr1 undef %2.sub0:areg_64_align2 = COPY %0 %2.sub1:areg_64_align2 = COPY %1 - INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_64_Align2 */, %2 + INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64_Align2 */, %2 SI_RETURN ... 
@@ -72,7 +72,7 @@ body: | ; CHECK-NEXT: undef [[COPY3:%[0-9]+]].sub0:areg_96 = COPY [[COPY]] ; CHECK-NEXT: [[COPY3:%[0-9]+]].sub1:areg_96 = COPY [[COPY1]] ; CHECK-NEXT: [[COPY3:%[0-9]+]].sub2:areg_96 = COPY [[COPY2]] - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 6094857 /* reguse:AReg_96 */, [[COPY3]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 5505033 /* reguse:AReg_96 */, [[COPY3]] ; CHECK-NEXT: SI_RETURN %0:vgpr_32 = COPY $vgpr0 %1:vgpr_32 = COPY $vgpr1 @@ -80,7 +80,7 @@ body: | undef %3.sub0:areg_96 = COPY %0 %3.sub1:areg_96 = COPY %1 %3.sub2:areg_96 = COPY %2 - INLINEASM &"; use $0", 0 /* attdialect */, 6094857 /* reguse:AReg_96 */, %3 + INLINEASM &"; use $0", 0 /* attdialect */, 5505033 /* reguse:AReg_96 */, %3 SI_RETURN ... @@ -101,7 +101,7 @@ body: | ; CHECK-NEXT: undef [[COPY3:%[0-9]+]].sub0:areg_96_align2 = COPY [[COPY]] ; CHECK-NEXT: [[COPY3:%[0-9]+]].sub1:areg_96_align2 = COPY [[COPY1]] ; CHECK-NEXT: [[COPY3:%[0-9]+]].sub2:areg_96_align2 = COPY [[COPY2]] - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 6422537 /* reguse:AReg_96_Align2 */, [[COPY3]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 5832713 /* reguse:AReg_96_Align2 */, [[COPY3]] ; CHECK-NEXT: SI_RETURN %0:vgpr_32 = COPY $vgpr0 %1:vgpr_32 = COPY $vgpr1 @@ -109,7 +109,7 @@ body: | undef %3.sub0:areg_96_align2 = COPY %0 %3.sub1:areg_96_align2 = COPY %1 %3.sub2:areg_96_align2 = COPY %2 - INLINEASM &"; use $0", 0 /* attdialect */, 6422537 /* reguse:AReg_96_Align2 */, %3 + INLINEASM &"; use $0", 0 /* attdialect */, 5832713 /* reguse:AReg_96_Align2 */, %3 SI_RETURN ... @@ -128,13 +128,13 @@ body: | ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0_sub1:areg_128 = COPY [[COPY]] ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub2_sub3:areg_128 = COPY [[COPY1]] - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 8519689 /* reguse:AReg_128 */, [[COPY2]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 7929865 /* reguse:AReg_128 */, [[COPY2]] ; CHECK-NEXT: SI_RETURN %0:vreg_64 = COPY $vgpr0_vgpr1 %1:vreg_64 = COPY $vgpr2_vgpr3 undef %2.sub0_sub1:areg_128 = COPY %0 %2.sub2_sub3:areg_128 = COPY %1 - INLINEASM &"; use $0", 0 /* attdialect */, 8519689 /* reguse:AReg_128 */, killed %2 + INLINEASM &"; use $0", 0 /* attdialect */, 7929865 /* reguse:AReg_128 */, killed %2 SI_RETURN ... @@ -153,13 +153,13 @@ body: | ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0_sub1:areg_128_align2 = COPY [[COPY]] ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub2_sub3:areg_128_align2 = COPY [[COPY1]] - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 9240585 /* reguse:AReg_128_Align2 */, [[COPY2]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 8650761 /* reguse:AReg_128_Align2 */, [[COPY2]] ; CHECK-NEXT: SI_RETURN %0:vreg_64 = COPY $vgpr0_vgpr1 %1:vreg_64 = COPY $vgpr2_vgpr3 undef %2.sub0_sub1:areg_128_align2 = COPY %0 %2.sub2_sub3:areg_128_align2 = COPY %1 - INLINEASM &"; use $0", 0 /* attdialect */, 9240585 /* reguse:AReg_128_Align2 */, %2 + INLINEASM &"; use $0", 0 /* attdialect */, 8650761 /* reguse:AReg_128_Align2 */, %2 SI_RETURN ... 
@@ -178,13 +178,13 @@ body: | ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr9 ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0:areg_64_align2 = COPY [[COPY]] ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub1:areg_64_align2 = COPY [[COPY1]] - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_64_Align2 */, [[COPY2]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64_Align2 */, [[COPY2]] ; CHECK-NEXT: SI_RETURN %0:sgpr_32 = COPY $sgpr8 %1:sgpr_32 = COPY $sgpr9 undef %2.sub0:areg_64_align2 = COPY %0 %2.sub1:areg_64_align2 = COPY %1 - INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_64_Align2 */, %2 + INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64_Align2 */, %2 SI_RETURN ... @@ -203,13 +203,13 @@ body: | ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr1_vgpr2 ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0:areg_96 = COPY [[COPY]] ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub1_sub2:areg_96 = COPY [[COPY1]] - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 6094857 /* reguse:AReg_96 */, [[COPY2]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 5505033 /* reguse:AReg_96 */, [[COPY2]] ; CHECK-NEXT: SI_RETURN %0:vgpr_32 = COPY $vgpr0 %1:vreg_64 = COPY $vgpr1_vgpr2 undef %2.sub0:areg_96 = COPY %0 %2.sub1_sub2:areg_96 = COPY %1 - INLINEASM &"; use $0", 0 /* attdialect */, 6094857 /* reguse:AReg_96 */, %2 + INLINEASM &"; use $0", 0 /* attdialect */, 5505033 /* reguse:AReg_96 */, %2 SI_RETURN ... @@ -228,13 +228,13 @@ body: | ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr1_vgpr2 ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0:areg_96_align2 = COPY [[COPY]] ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub1_sub2:areg_96_align2 = COPY [[COPY1]] - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_64_Align2 */, [[COPY2]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64_Align2 */, [[COPY2]] ; CHECK-NEXT: SI_RETURN %0:vgpr_32 = COPY $vgpr0 %1:vreg_64 = COPY $vgpr1_vgpr2 undef %2.sub0:areg_96_align2 = COPY %0 %2.sub1_sub2:areg_96_align2 = COPY %1 - INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_64_Align2 */, %2 + INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64_Align2 */, %2 SI_RETURN ... @@ -253,13 +253,13 @@ body: | ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0_sub1:areg_96 = COPY [[COPY]] ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub2:areg_96 = COPY [[COPY1]] - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 6094857 /* reguse:AReg_96 */, [[COPY2]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 5505033 /* reguse:AReg_96 */, [[COPY2]] ; CHECK-NEXT: SI_RETURN %0:vreg_64 = COPY $vgpr0_vgpr1 %1:vgpr_32 = COPY $vgpr2 undef %2.sub0_sub1:areg_96 = COPY %0 %2.sub2:areg_96 = COPY %1 - INLINEASM &"; use $0", 0 /* attdialect */, 6094857 /* reguse:AReg_96 */, %2 + INLINEASM &"; use $0", 0 /* attdialect */, 5505033 /* reguse:AReg_96 */, %2 SI_RETURN ... 
@@ -278,13 +278,13 @@ body: | ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0_sub1:areg_96_align2 = COPY [[COPY]] ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub2:areg_96_align2 = COPY [[COPY1]] - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_64_Align2 */, [[COPY2]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64_Align2 */, [[COPY2]] ; CHECK-NEXT: SI_RETURN %0:vreg_64 = COPY $vgpr0_vgpr1 %1:vgpr_32 = COPY $vgpr2 undef %2.sub0_sub1:areg_96_align2 = COPY %0 %2.sub2:areg_96_align2 = COPY %1 - INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_64_Align2 */, %2 + INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64_Align2 */, %2 SI_RETURN ... @@ -302,12 +302,12 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_64 = COPY [[COPY]] ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_64 = COPY [[COPY]] - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3407881 /* reguse:AReg_64 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN %0:vgpr_32 = COPY $vgpr0 undef %2.sub0:areg_64 = COPY %0 %2.sub1:areg_64 = COPY %0 - INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64 */, killed %2 + INLINEASM &"; use $0", 0 /* attdialect */, 3407881 /* reguse:AReg_64 */, killed %2 SI_RETURN ... @@ -326,13 +326,13 @@ body: | ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0:areg_64_align2 = COPY [[COPY]] ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub1:areg_64_align2 = COPY [[COPY1]] - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_64_Align2 */, [[COPY2]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64_Align2 */, [[COPY2]] ; CHECK-NEXT: SI_RETURN %0:vgpr_32 = COPY $vgpr0 %1:vgpr_32 = COPY $vgpr1 undef %2.sub0:areg_64_align2 = COPY %0 %2.sub1:areg_64_align2 = COPY %1 - INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_64_Align2 */, %2 + INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64_Align2 */, %2 SI_RETURN ... @@ -350,12 +350,12 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_96 = COPY [[COPY]] ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_96 = COPY [[COPY]] - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 6094857 /* reguse:AReg_96 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 5505033 /* reguse:AReg_96 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN %0:vgpr_32 = COPY $vgpr0 undef %1.sub0:areg_96 = COPY %0 %1.sub1:areg_96 = COPY %0 - INLINEASM &"; use $0", 0 /* attdialect */, 6094857 /* reguse:AReg_96 */, %1 + INLINEASM &"; use $0", 0 /* attdialect */, 5505033 /* reguse:AReg_96 */, %1 SI_RETURN ... 
@@ -373,12 +373,12 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_96_align2 = COPY [[COPY]] ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_96_align2 = COPY [[COPY]] - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 6422537 /* reguse:AReg_96_Align2 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 5832713 /* reguse:AReg_96_Align2 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN %0:vgpr_32 = COPY $vgpr0 undef %1.sub0:areg_96_align2 = COPY %0 %1.sub1:areg_96_align2 = COPY %0 - INLINEASM &"; use $0", 0 /* attdialect */, 6422537 /* reguse:AReg_96_Align2 */, %1 + INLINEASM &"; use $0", 0 /* attdialect */, 5832713 /* reguse:AReg_96_Align2 */, %1 SI_RETURN ... @@ -398,14 +398,14 @@ body: | ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_128 = COPY [[COPY]] ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2:areg_128 = COPY [[COPY]] ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub3:areg_128 = COPY [[COPY]] - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 8519689 /* reguse:AReg_128 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 7929865 /* reguse:AReg_128 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN %0:vgpr_32 = COPY $vgpr0 undef %1.sub0:areg_128 = COPY %0 %1.sub1:areg_128 = COPY %0 %1.sub2:areg_128 = COPY %0 %1.sub3:areg_128 = COPY %0 - INLINEASM &"; use $0", 0 /* attdialect */, 8519689 /* reguse:AReg_128 */, killed %1 + INLINEASM &"; use $0", 0 /* attdialect */, 7929865 /* reguse:AReg_128 */, killed %1 SI_RETURN ... @@ -425,14 +425,14 @@ body: | ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_128_align2 = COPY [[COPY]] ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2:areg_128_align2 = COPY [[COPY]] ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub3:areg_128_align2 = COPY [[COPY]] - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 9240585 /* reguse:AReg_128_Align2 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 8650761 /* reguse:AReg_128_Align2 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN %0:vgpr_32 = COPY $vgpr0 undef %1.sub0:areg_128_align2 = COPY %0 %1.sub1:areg_128_align2 = COPY %0 %1.sub2:areg_128_align2 = COPY %0 %1.sub3:areg_128_align2 = COPY %0 - INLINEASM &"; use $0", 0 /* attdialect */, 9240585 /* reguse:AReg_128_Align2 */, %1 + INLINEASM &"; use $0", 0 /* attdialect */, 8650761 /* reguse:AReg_128_Align2 */, %1 SI_RETURN ... @@ -451,15 +451,15 @@ body: | ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0:areg_64 = COPY [[COPY]] ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub1:areg_64 = COPY [[COPY1]] - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64 */, [[COPY2]] - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 1835017 /* reguse:VGPR_32 */, [[COPY]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3407881 /* reguse:AReg_64 */, [[COPY2]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 1245193 /* reguse:VGPR_32 */, [[COPY]] ; CHECK-NEXT: SI_RETURN %0:vgpr_32 = COPY $vgpr0 %1:vgpr_32 = COPY $vgpr1 undef %2.sub0:areg_64 = COPY %0 %2.sub1:areg_64 = COPY %1 - INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64 */, killed %2 - INLINEASM &"; use $0", 0 /* attdialect */, 1835017 /* reguse:VGPR_32 */, killed %0 + INLINEASM &"; use $0", 0 /* attdialect */, 3407881 /* reguse:AReg_64 */, killed %2 + INLINEASM &"; use $0", 0 /* attdialect */, 1245193 /* reguse:VGPR_32 */, killed %0 SI_RETURN ... 
@@ -477,14 +477,14 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_64 = COPY [[COPY]] ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_64 = COPY [[COPY]] - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64 */, [[COPY1]] - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 1835017 /* reguse:VGPR_32 */, [[COPY]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3407881 /* reguse:AReg_64 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 1245193 /* reguse:VGPR_32 */, [[COPY]] ; CHECK-NEXT: SI_RETURN %0:vgpr_32 = COPY $vgpr0 undef %1.sub0:areg_64 = COPY %0 %1.sub1:areg_64 = COPY %0 - INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64 */, killed %1 - INLINEASM &"; use $0", 0 /* attdialect */, 1835017 /* reguse:VGPR_32 */, killed %0 + INLINEASM &"; use $0", 0 /* attdialect */, 3407881 /* reguse:AReg_64 */, killed %1 + INLINEASM &"; use $0", 0 /* attdialect */, 1245193 /* reguse:VGPR_32 */, killed %0 SI_RETURN ... @@ -503,16 +503,16 @@ body: | ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_64 = COPY [[COPY]].sub0 ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_64 = COPY [[COPY]].sub0 ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1:vreg_64 = COPY [[COPY]].sub0 - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64 */, [[COPY1]] - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3407881 /* reguse:VReg_64 */, [[COPY]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3407881 /* reguse:AReg_64 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 2818057 /* reguse:VReg_64 */, [[COPY]] ; CHECK-NEXT: SI_RETURN %0:vgpr_32 = COPY $vgpr0 undef %1.sub0:areg_64 = COPY %0 %1.sub1:areg_64 = COPY %0 undef %2.sub0:vreg_64 = COPY %0 %2.sub1:vreg_64 = COPY %0 - INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64 */, killed %1 - INLINEASM &"; use $0", 0 /* attdialect */, 3407881 /* reguse:VReg_64 */, killed %2 + INLINEASM &"; use $0", 0 /* attdialect */, 3407881 /* reguse:AReg_64 */, killed %1 + INLINEASM &"; use $0", 0 /* attdialect */, 2818057 /* reguse:VReg_64 */, killed %2 SI_RETURN ... @@ -533,13 +533,13 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1:vreg_64 = COPY $vgpr1 ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_64 = COPY [[COPY]].sub0 ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_64 = COPY [[COPY]].sub1 - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3407881 /* reguse:AReg_64 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN undef %0.sub0:vreg_64 = COPY $vgpr0 %0.sub1:vreg_64 = COPY $vgpr1 undef %2.sub0:areg_64 = COPY %0.sub0 %2.sub1:areg_64 = COPY %0.sub1 - INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64 */, killed %2 + INLINEASM &"; use $0", 0 /* attdialect */, 3407881 /* reguse:AReg_64 */, killed %2 SI_RETURN ... 
@@ -558,13 +558,13 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1:vreg_64 = COPY $vgpr1 ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_64_align2 = COPY [[COPY]].sub0 ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_64_align2 = COPY [[COPY]].sub1 - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_64_Align2 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64_Align2 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN undef %0.sub0:vreg_64 = COPY $vgpr0 %0.sub1:vreg_64 = COPY $vgpr1 undef %2.sub0:areg_64_align2 = COPY %0.sub0 %2.sub1:areg_64_align2 = COPY %0.sub1 - INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_64_Align2 */, %2 + INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64_Align2 */, %2 SI_RETURN ... @@ -585,7 +585,7 @@ body: | ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_96 = COPY [[COPY]].sub0 ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_96 = COPY [[COPY]].sub1 ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2:areg_96 = COPY [[COPY]].sub2 - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 6094857 /* reguse:AReg_96 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 5505033 /* reguse:AReg_96 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN undef %0.sub0:vreg_96 =COPY $vgpr0 %0.sub1:vreg_96 = COPY $vgpr1 @@ -593,7 +593,7 @@ body: | undef %3.sub0:areg_96 = COPY %0.sub0 %3.sub1:areg_96 = COPY %0.sub1 %3.sub2:areg_96 = COPY %0.sub2 - INLINEASM &"; use $0", 0 /* attdialect */, 6094857 /* reguse:AReg_96 */, %3 + INLINEASM &"; use $0", 0 /* attdialect */, 5505033 /* reguse:AReg_96 */, %3 SI_RETURN ... @@ -614,7 +614,7 @@ body: | ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_96_align2 = COPY [[COPY]].sub0 ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_96_align2 = COPY [[COPY]].sub1 ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2:areg_96_align2 = COPY [[COPY]].sub2 - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 6422537 /* reguse:AReg_96_Align2 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 5832713 /* reguse:AReg_96_Align2 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN undef %0.sub0:vreg_96 =COPY $vgpr0 %0.sub1:vreg_96 = COPY $vgpr1 @@ -622,7 +622,7 @@ body: | undef %3.sub0:areg_96_align2 = COPY %0.sub0 %3.sub1:areg_96_align2 = COPY %0.sub1 %3.sub2:areg_96_align2 = COPY %0.sub2 - INLINEASM &"; use $0", 0 /* attdialect */, 6422537 /* reguse:AReg_96_Align2 */, %3 + INLINEASM &"; use $0", 0 /* attdialect */, 5832713 /* reguse:AReg_96_Align2 */, %3 SI_RETURN ... @@ -641,13 +641,13 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]].sub2_sub3:vreg_128 = COPY $vgpr2_vgpr3 ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0_sub1:areg_128 = COPY [[COPY]].sub0_sub1 ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2_sub3:areg_128 = COPY [[COPY]].sub2_sub3 - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 8519689 /* reguse:AReg_128 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 7929865 /* reguse:AReg_128 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN undef %0.sub0_sub1:vreg_128 =COPY $vgpr0_vgpr1 %0.sub2_sub3:vreg_128 = COPY $vgpr2_vgpr3 undef %2.sub0_sub1:areg_128 = COPY %0.sub0_sub1 %2.sub2_sub3:areg_128 = COPY %0.sub2_sub3 - INLINEASM &"; use $0", 0 /* attdialect */, 8519689 /* reguse:AReg_128 */, killed %2 + INLINEASM &"; use $0", 0 /* attdialect */, 7929865 /* reguse:AReg_128 */, killed %2 SI_RETURN ... 
@@ -668,13 +668,13 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1:vreg_128 = COPY $vgpr2_vgpr3 ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0_sub1:areg_128_align2 = COPY [[COPY]].sub0 ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2_sub3:areg_128_align2 = COPY [[COPY]].sub1 - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 9240585 /* reguse:AReg_128_Align2 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 8650761 /* reguse:AReg_128_Align2 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN undef %0.sub0:vreg_128 =COPY $vgpr0_vgpr1 %0.sub1:vreg_128 = COPY $vgpr2_vgpr3 undef %2.sub0_sub1:areg_128_align2 = COPY %0.sub0 %2.sub2_sub3:areg_128_align2 = COPY %0.sub1 - INLINEASM &"; use $0", 0 /* attdialect */, 9240585 /* reguse:AReg_128_Align2 */, %2 + INLINEASM &"; use $0", 0 /* attdialect */, 8650761 /* reguse:AReg_128_Align2 */, %2 SI_RETURN ... @@ -693,13 +693,13 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1:sreg_64 = COPY $sgpr9 ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_64_align2 = COPY [[COPY]].sub0 ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_64_align2 = COPY [[COPY]].sub1 - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_64_Align2 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64_Align2 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN undef %0.sub0:sreg_64 = COPY $sgpr8 %0.sub1:sreg_64 = COPY $sgpr9 undef %2.sub0:areg_64_align2 = COPY %0.sub0 %2.sub1:areg_64_align2 = COPY %0.sub1 - INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_64_Align2 */, %2 + INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64_Align2 */, %2 SI_RETURN ... @@ -718,13 +718,13 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1_sub2:vreg_96 = COPY $vgpr1_vgpr2 ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_96 = COPY [[COPY]].sub0 ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1_sub2:areg_96 = COPY [[COPY]].sub1_sub2 - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 6094857 /* reguse:AReg_96 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 5505033 /* reguse:AReg_96 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN undef %0.sub0:vreg_96 =COPY $vgpr0 %0.sub1_sub2:vreg_96 = COPY $vgpr1_vgpr2 undef %2.sub0:areg_96 = COPY %0.sub0 %2.sub1_sub2:areg_96 = COPY %0.sub1_sub2 - INLINEASM &"; use $0", 0 /* attdialect */, 6094857 /* reguse:AReg_96 */, %2 + INLINEASM &"; use $0", 0 /* attdialect */, 5505033 /* reguse:AReg_96 */, %2 SI_RETURN ... @@ -743,13 +743,13 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1_sub2:vreg_96 = COPY $vgpr1_vgpr2 ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_96_align2 = COPY [[COPY]].sub0 ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1_sub2:areg_96_align2 = COPY [[COPY]].sub1_sub2 - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_64_Align2 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64_Align2 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN undef %0.sub0:vreg_96 =COPY $vgpr0 %0.sub1_sub2:vreg_96 = COPY $vgpr1_vgpr2 undef %2.sub0:areg_96_align2 = COPY %0.sub0 %2.sub1_sub2:areg_96_align2 = COPY %0.sub1_sub2 - INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_64_Align2 */, %2 + INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64_Align2 */, %2 SI_RETURN ... 
@@ -768,13 +768,13 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]].sub2:vreg_96 = COPY $vgpr2 ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0_sub1:areg_96 = COPY [[COPY]].sub0_sub1 ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2:areg_96 = COPY [[COPY]].sub2 - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 6094857 /* reguse:AReg_96 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 5505033 /* reguse:AReg_96 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN undef %0.sub0_sub1:vreg_96 = COPY $vgpr0_vgpr1 %0.sub2:vreg_96 = COPY $vgpr2 undef %2.sub0_sub1:areg_96 = COPY %0.sub0_sub1 %2.sub2:areg_96 = COPY %0.sub2 - INLINEASM &"; use $0", 0 /* attdialect */, 6094857 /* reguse:AReg_96 */, %2 + INLINEASM &"; use $0", 0 /* attdialect */, 5505033 /* reguse:AReg_96 */, %2 SI_RETURN ... @@ -793,13 +793,13 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]].sub2:vreg_96 = COPY $vgpr2 ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0_sub1:areg_96_align2 = COPY [[COPY]].sub0_sub1 ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2:areg_96_align2 = COPY [[COPY]].sub2 - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_64_Align2 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64_Align2 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN undef %0.sub0_sub1:vreg_96 = COPY $vgpr0_vgpr1 %0.sub2:vreg_96 = COPY $vgpr2 undef %2.sub0_sub1:areg_96_align2 = COPY %0.sub0_sub1 %2.sub2:areg_96_align2 = COPY %0.sub2 - INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_64_Align2 */, %2 + INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64_Align2 */, %2 SI_RETURN ... @@ -817,12 +817,12 @@ body: | ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:vreg_64 = COPY $vgpr0 ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_64 = COPY [[COPY]].sub0 ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_64 = COPY [[COPY]].sub0 - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3407881 /* reguse:AReg_64 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN undef %0.sub0:vreg_64 = COPY $vgpr0 undef %2.sub0:areg_64 = COPY %0.sub0 %2.sub1:areg_64 = COPY %0.sub0 - INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64 */, killed %2 + INLINEASM &"; use $0", 0 /* attdialect */, 3407881 /* reguse:AReg_64 */, killed %2 SI_RETURN ... @@ -841,13 +841,13 @@ body: | ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_96 = COPY [[COPY]].sub0 ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_96 = COPY [[COPY]].sub0 ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2:areg_96 = COPY [[COPY]].sub0 - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 6094857 /* reguse:AReg_96 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 5505033 /* reguse:AReg_96 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN undef %0.sub0:vreg_64 = COPY $vgpr0 undef %1.sub0:areg_96 = COPY %0.sub0 %1.sub1:areg_96 = COPY %0.sub0 %1.sub2:areg_96 = COPY %0.sub0 - INLINEASM &"; use $0", 0 /* attdialect */, 6094857 /* reguse:AReg_96 */, %1 + INLINEASM &"; use $0", 0 /* attdialect */, 5505033 /* reguse:AReg_96 */, %1 SI_RETURN ... 
@@ -865,12 +865,12 @@ body: | ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:vreg_64 = COPY $vgpr0 ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_96_align2 = COPY [[COPY]].sub0 ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_96_align2 = COPY [[COPY]].sub0 - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 6422537 /* reguse:AReg_96_Align2 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 5832713 /* reguse:AReg_96_Align2 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN undef %0.sub0:vreg_64 = COPY $vgpr0 undef %1.sub0:areg_96_align2 = COPY %0.sub0 %1.sub1:areg_96_align2 = COPY %0.sub0 - INLINEASM &"; use $0", 0 /* attdialect */, 6422537 /* reguse:AReg_96_Align2 */, %1 + INLINEASM &"; use $0", 0 /* attdialect */, 5832713 /* reguse:AReg_96_Align2 */, %1 SI_RETURN ... @@ -890,14 +890,14 @@ body: | ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_128 = COPY [[COPY]].sub0 ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2:areg_128 = COPY [[COPY]].sub0 ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub3:areg_128 = COPY [[COPY]].sub0 - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 8519689 /* reguse:AReg_128 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 7929865 /* reguse:AReg_128 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN undef %0.sub0:vreg_64 = COPY $vgpr0 undef %1.sub0:areg_128 = COPY %0.sub0 %1.sub1:areg_128 = COPY %0.sub0 %1.sub2:areg_128 = COPY %0.sub0 %1.sub3:areg_128 = COPY %0.sub0 - INLINEASM &"; use $0", 0 /* attdialect */, 8519689 /* reguse:AReg_128 */, killed %1 + INLINEASM &"; use $0", 0 /* attdialect */, 7929865 /* reguse:AReg_128 */, killed %1 SI_RETURN ... @@ -917,14 +917,14 @@ body: | ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_128_align2 = COPY [[COPY]].sub0 ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2:areg_128_align2 = COPY [[COPY]].sub0 ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub3:areg_128_align2 = COPY [[COPY]].sub0 - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 9240585 /* reguse:AReg_128_Align2 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 8650761 /* reguse:AReg_128_Align2 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN undef %0.sub0:vreg_64 = COPY $vgpr0 undef %1.sub0:areg_128_align2 = COPY %0.sub0 %1.sub1:areg_128_align2 = COPY %0.sub0 %1.sub2:areg_128_align2 = COPY %0.sub0 %1.sub3:areg_128_align2 = COPY %0.sub0 - INLINEASM &"; use $0", 0 /* attdialect */, 9240585 /* reguse:AReg_128_Align2 */, %1 + INLINEASM &"; use $0", 0 /* attdialect */, 8650761 /* reguse:AReg_128_Align2 */, %1 SI_RETURN ... @@ -943,13 +943,13 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1:vreg_64 = COPY $vgpr1 ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_64 = COPY [[COPY]].sub0 ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_64 = COPY [[COPY]].sub1 - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3407881 /* reguse:AReg_64 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN undef %0.sub0:vreg_64 = COPY $vgpr0 %0.sub1:vreg_64 = COPY $vgpr1 undef %2.sub0:areg_64 = COPY %0.sub0 %2.sub1:areg_64 = COPY %0.sub1 - INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64 */, killed %2 + INLINEASM &"; use $0", 0 /* attdialect */, 3407881 /* reguse:AReg_64 */, killed %2 SI_RETURN ... 
@@ -968,13 +968,13 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1:vreg_64_align2 = COPY $vgpr1 ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_64_align2 = COPY [[COPY]].sub0 ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_64_align2 = COPY [[COPY]].sub1 - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_64_Align2 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64_Align2 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN undef %0.sub0:vreg_64_align2 = COPY $vgpr0 %0.sub1:vreg_64_align2 = COPY $vgpr1 undef %2.sub0:areg_64_align2 = COPY %0.sub0 %2.sub1:areg_64_align2 = COPY %0.sub1 - INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_64_Align2 */, %2 + INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64_Align2 */, %2 SI_RETURN ... @@ -995,7 +995,7 @@ body: | ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_96 = COPY [[COPY]].sub0 ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_96 = COPY [[COPY]].sub1 ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2:areg_96 = COPY [[COPY]].sub2 - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 6094857 /* reguse:AReg_96 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 5505033 /* reguse:AReg_96 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN undef %0.sub0:vreg_96 = COPY $vgpr0 %0.sub1:vreg_96 = COPY $vgpr1 @@ -1003,7 +1003,7 @@ body: | undef %3.sub0:areg_96 = COPY %0.sub0 %3.sub1:areg_96 = COPY %0.sub1 %3.sub2:areg_96 = COPY %0.sub2 - INLINEASM &"; use $0", 0 /* attdialect */, 6094857 /* reguse:AReg_96 */, %3 + INLINEASM &"; use $0", 0 /* attdialect */, 5505033 /* reguse:AReg_96 */, %3 SI_RETURN ... @@ -1024,7 +1024,7 @@ body: | ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_96_align2 = COPY [[COPY]].sub0 ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_96_align2 = COPY [[COPY]].sub1 ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2:areg_96_align2 = COPY [[COPY]].sub2 - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 6422537 /* reguse:AReg_96_Align2 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 5832713 /* reguse:AReg_96_Align2 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN undef %0.sub0:vreg_96_align2 = COPY $vgpr0 %0.sub1:vreg_96_align2 = COPY $vgpr1 @@ -1032,7 +1032,7 @@ body: | undef %3.sub0:areg_96_align2 = COPY %0.sub0 %3.sub1:areg_96_align2 = COPY %0.sub1 %3.sub2:areg_96_align2 = COPY %0.sub2 - INLINEASM &"; use $0", 0 /* attdialect */, 6422537 /* reguse:AReg_96_Align2 */, %3 + INLINEASM &"; use $0", 0 /* attdialect */, 5832713 /* reguse:AReg_96_Align2 */, %3 SI_RETURN ... @@ -1051,13 +1051,13 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]].sub2_sub3:vreg_128 = COPY $vgpr2_vgpr3 ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0_sub1:areg_128 = COPY [[COPY]].sub0_sub1 ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2_sub3:areg_128 = COPY [[COPY]].sub2_sub3 - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 8519689 /* reguse:AReg_128 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 7929865 /* reguse:AReg_128 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN undef %0.sub0_sub1:vreg_128 = COPY $vgpr0_vgpr1 %0.sub2_sub3:vreg_128 = COPY $vgpr2_vgpr3 undef %2.sub0_sub1:areg_128 = COPY %0.sub0_sub1 %2.sub2_sub3:areg_128 = COPY %0.sub2_sub3 - INLINEASM &"; use $0", 0 /* attdialect */, 8519689 /* reguse:AReg_128 */, killed %2 + INLINEASM &"; use $0", 0 /* attdialect */, 7929865 /* reguse:AReg_128 */, killed %2 SI_RETURN ... 
@@ -1076,13 +1076,13 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]].sub2_sub3:vreg_128_align2 = COPY $vgpr2_vgpr3 ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0_sub1:areg_128_align2 = COPY [[COPY]].sub0_sub1 ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2_sub3:areg_128_align2 = COPY [[COPY]].sub2_sub3 - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 9240585 /* reguse:AReg_128_Align2 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 8650761 /* reguse:AReg_128_Align2 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN undef %0.sub0_sub1:vreg_128_align2 = COPY $vgpr0_vgpr1 %0.sub2_sub3:vreg_128_align2 = COPY $vgpr2_vgpr3 undef %2.sub0_sub1:areg_128_align2 = COPY %0.sub0_sub1 %2.sub2_sub3:areg_128_align2 = COPY %0.sub2_sub3 - INLINEASM &"; use $0", 0 /* attdialect */, 9240585 /* reguse:AReg_128_Align2 */, %2 + INLINEASM &"; use $0", 0 /* attdialect */, 8650761 /* reguse:AReg_128_Align2 */, %2 SI_RETURN ... @@ -1101,13 +1101,13 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1:sreg_64 = COPY $sgpr9 ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_64_align2 = COPY [[COPY]].sub0 ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_64_align2 = COPY [[COPY]].sub1 - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_64_Align2 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64_Align2 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN undef %0.sub0:sreg_64 = COPY $sgpr8 %0.sub1:sreg_64 = COPY $sgpr9 undef %2.sub0:areg_64_align2 = COPY %0.sub0 %2.sub1:areg_64_align2 = COPY %0.sub1 - INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_64_Align2 */, %2 + INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64_Align2 */, %2 SI_RETURN ... @@ -1126,13 +1126,13 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1_sub2:vreg_96 = COPY $vgpr1_vgpr2 ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_96 = COPY [[COPY]].sub0 ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1_sub2:areg_96 = COPY [[COPY]].sub1_sub2 - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 6094857 /* reguse:AReg_96 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 5505033 /* reguse:AReg_96 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN undef %0.sub0:vreg_96 = COPY $vgpr0 %0.sub1_sub2:vreg_96 = COPY $vgpr1_vgpr2 undef %2.sub0:areg_96 = COPY %0.sub0 %2.sub1_sub2:areg_96 = COPY %0.sub1_sub2 - INLINEASM &"; use $0", 0 /* attdialect */, 6094857 /* reguse:AReg_96 */, %2 + INLINEASM &"; use $0", 0 /* attdialect */, 5505033 /* reguse:AReg_96 */, %2 SI_RETURN ... @@ -1150,13 +1150,13 @@ body: | ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1_sub2:vreg_96 = COPY $vgpr1_vgpr2 ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0:areg_96 = COPY [[COPY]] ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub1_sub2:areg_96 = COPY [[COPY1]] - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 6094857 /* reguse:AReg_96 */, [[COPY2]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 5505033 /* reguse:AReg_96 */, [[COPY2]] ; CHECK-NEXT: SI_RETURN undef %0.sub0:vreg_96 = COPY $vgpr0 %0.sub1_sub2:vreg_96 = COPY $vgpr1_vgpr2 undef %2.sub0:areg_96 = COPY %0.sub2 %2.sub1_sub2:areg_96 = COPY %0.sub0_sub1 - INLINEASM &"; use $0", 0 /* attdialect */, 6094857 /* reguse:AReg_96 */, %2 + INLINEASM &"; use $0", 0 /* attdialect */, 5505033 /* reguse:AReg_96 */, %2 SI_RETURN ... 
@@ -1176,13 +1176,13 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1_sub2:vreg_96_align2 = COPY $vgpr1_vgpr2 ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_96_align2 = COPY [[COPY]].sub0 ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1_sub2:areg_96_align2 = COPY [[COPY]].sub1_sub2 - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_64_Align2 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64_Align2 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN undef %0.sub0:vreg_96_align2 = COPY $vgpr0 %0.sub1_sub2:vreg_96_align2 = COPY $vgpr1_vgpr2 undef %2.sub0:areg_96_align2 = COPY %0.sub0 %2.sub1_sub2:areg_96_align2 = COPY %0.sub1_sub2 - INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_64_Align2 */, %2 + INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64_Align2 */, %2 SI_RETURN ... @@ -1201,13 +1201,13 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]].sub2:vreg_96 = COPY $vgpr2 ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0_sub1:areg_96 = COPY [[COPY]].sub0_sub1 ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2:areg_96 = COPY [[COPY]].sub2 - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 6094857 /* reguse:AReg_96 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 5505033 /* reguse:AReg_96 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN undef %0.sub0_sub1:vreg_96 = COPY $vgpr0_vgpr1 %0.sub2:vreg_96 = COPY $vgpr2 undef %2.sub0_sub1:areg_96 = COPY %0.sub0_sub1 %2.sub2:areg_96 = COPY %0.sub2 - INLINEASM &"; use $0", 0 /* attdialect */, 6094857 /* reguse:AReg_96 */, %2 + INLINEASM &"; use $0", 0 /* attdialect */, 5505033 /* reguse:AReg_96 */, %2 SI_RETURN ... @@ -1226,13 +1226,13 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]].sub2:vreg_96_align2 = COPY $vgpr2 ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0_sub1:areg_96_align2 = COPY [[COPY]].sub0_sub1 ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2:areg_96_align2 = COPY [[COPY]].sub2 - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_64_Align2 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64_Align2 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN undef %0.sub0_sub1:vreg_96_align2 = COPY $vgpr0_vgpr1 %0.sub2:vreg_96_align2 = COPY $vgpr2 undef %2.sub0_sub1:areg_96_align2 = COPY %0.sub0_sub1 %2.sub2:areg_96_align2 = COPY %0.sub2 - INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_64_Align2 */, %2 + INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64_Align2 */, %2 SI_RETURN ... @@ -1251,13 +1251,13 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]].sub2:vreg_96 = COPY $vgpr2 ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0_sub1:areg_96_align2 = COPY [[COPY]].sub0_sub1 ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2:areg_96_align2 = COPY [[COPY]].sub2 - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_64_Align2 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64_Align2 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN undef %0.sub0_sub1:vreg_96 = COPY $vgpr0_vgpr1 %0.sub2:vreg_96 = COPY $vgpr2 undef %2.sub0_sub1:areg_96_align2 = COPY %0.sub0_sub1 %2.sub2:areg_96_align2 = COPY %0.sub2 - INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_64_Align2 */, %2 + INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64_Align2 */, %2 SI_RETURN ... 
@@ -1274,11 +1274,11 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:areg_64 = COPY [[COPY]] - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3407881 /* reguse:AReg_64 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN %0:vreg_64 = COPY $vgpr0_vgpr1 %2:areg_64 = COPY %0 - INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64 */, killed %2 + INLINEASM &"; use $0", 0 /* attdialect */, 3407881 /* reguse:AReg_64 */, killed %2 SI_RETURN ... @@ -1295,11 +1295,11 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_64_align2 = COPY $vgpr0_vgpr1 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:areg_64_align2 = COPY [[COPY]] - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_64_Align2 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64_Align2 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN %0:vreg_64_align2 = COPY $vgpr0_vgpr1 %2:areg_64_align2 = COPY %0 - INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_64_Align2 */, %2 + INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64_Align2 */, %2 SI_RETURN ... @@ -1316,11 +1316,11 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_96 = COPY $vgpr0_vgpr1_vgpr2 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:areg_96 = COPY [[COPY]] - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 6094857 /* reguse:AReg_96 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 5505033 /* reguse:AReg_96 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN %0:vreg_96 = COPY $vgpr0_vgpr1_vgpr2 %3:areg_96 = COPY %0 - INLINEASM &"; use $0", 0 /* attdialect */, 6094857 /* reguse:AReg_96 */, %3 + INLINEASM &"; use $0", 0 /* attdialect */, 5505033 /* reguse:AReg_96 */, %3 SI_RETURN ... @@ -1337,11 +1337,11 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_96_align2 = COPY $vgpr0_vgpr1_vgpr2 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:areg_96_align2 = COPY [[COPY]] - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 6422537 /* reguse:AReg_96_Align2 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 5832713 /* reguse:AReg_96_Align2 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN %0:vreg_96_align2 = COPY $vgpr0_vgpr1_vgpr2 %3:areg_96_align2 = COPY %0 - INLINEASM &"; use $0", 0 /* attdialect */, 6422537 /* reguse:AReg_96_Align2 */, %3 + INLINEASM &"; use $0", 0 /* attdialect */, 5832713 /* reguse:AReg_96_Align2 */, %3 SI_RETURN ... @@ -1358,11 +1358,11 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_128 = COPY $vgpr0_vgpr1_vgpr2_vgpr3 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:areg_128 = COPY [[COPY]] - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 8519689 /* reguse:AReg_128 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 7929865 /* reguse:AReg_128 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN %0:vreg_128 = COPY $vgpr0_vgpr1_vgpr2_vgpr3 %2:areg_128 = COPY %0 - INLINEASM &"; use $0", 0 /* attdialect */, 8519689 /* reguse:AReg_128 */, killed %2 + INLINEASM &"; use $0", 0 /* attdialect */, 7929865 /* reguse:AReg_128 */, killed %2 SI_RETURN ... 
@@ -1379,11 +1379,11 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_128_align2 = COPY $vgpr0_vgpr1_vgpr2_vgpr3 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:areg_128_align2 = COPY [[COPY]] - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 9240585 /* reguse:AReg_128_Align2 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 8650761 /* reguse:AReg_128_Align2 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN %0:vreg_128_align2 = COPY $vgpr0_vgpr1_vgpr2_vgpr3 %2:areg_128_align2 = COPY %0 - INLINEASM &"; use $0", 0 /* attdialect */, 9240585 /* reguse:AReg_128_Align2 */, %2 + INLINEASM &"; use $0", 0 /* attdialect */, 8650761 /* reguse:AReg_128_Align2 */, %2 SI_RETURN ... @@ -1400,11 +1400,11 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr8_sgpr9 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:areg_64_align2 = COPY [[COPY]] - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_64_Align2 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64_Align2 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN %0:sreg_64 = COPY $sgpr8_sgpr9 %2:areg_64_align2 = COPY %0 - INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_64_Align2 */, %2 + INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64_Align2 */, %2 SI_RETURN ... @@ -1421,11 +1421,11 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_96 = COPY $vgpr0_vgpr1_vgpr2 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:areg_96_align2 = COPY [[COPY]] - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_64_Align2 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64_Align2 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN %0:vreg_96 = COPY $vgpr0_vgpr1_vgpr2 %2:areg_96_align2 = COPY %0 - INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_64_Align2 */, %2 + INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64_Align2 */, %2 SI_RETURN ... 
diff --git a/llvm/test/CodeGen/AMDGPU/coalescer-early-clobber-subreg.mir b/llvm/test/CodeGen/AMDGPU/coalescer-early-clobber-subreg.mir index 126cbc643accf..856d1e66fee9d 100644 --- a/llvm/test/CodeGen/AMDGPU/coalescer-early-clobber-subreg.mir +++ b/llvm/test/CodeGen/AMDGPU/coalescer-early-clobber-subreg.mir @@ -20,10 +20,10 @@ body: | ; CHECK-LABEL: name: foo1 ; CHECK: liveins: $vgpr0_vgpr1 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: INLINEASM &"", 0 /* attdialect */, 1835018 /* regdef:VGPR_32 */, def undef %2.sub0, 1835019 /* regdef-ec:VGPR_32 */, def undef early-clobber %2.sub1 + ; CHECK-NEXT: INLINEASM &"", 0 /* attdialect */, 1245194 /* regdef:VGPR_32 */, def undef %2.sub0, 1245195 /* regdef-ec:VGPR_32 */, def undef early-clobber %2.sub1 ; CHECK-NEXT: FLAT_STORE_DWORDX2 $vgpr0_vgpr1, %2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s64)) ; CHECK-NEXT: S_ENDPGM 0 - INLINEASM &"", 0 /* attdialect */, 1835018 /* regdef:VGPR_32 */, def %0:vgpr_32, 1835019 /* regdef-ec:VGPR_32 */, def early-clobber %1:vgpr_32 + INLINEASM &"", 0 /* attdialect */, 1245194 /* regdef:VGPR_32 */, def %0:vgpr_32, 1245195 /* regdef-ec:VGPR_32 */, def early-clobber %1:vgpr_32 undef %2.sub0:vreg_64 = COPY killed %0 %2.sub1:vreg_64 = COPY killed %1 FLAT_STORE_DWORDX2 killed $vgpr0_vgpr1, killed %2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s64)) @@ -41,10 +41,10 @@ body: | ; CHECK-LABEL: name: foo2 ; CHECK: liveins: $vgpr0_vgpr1 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: INLINEASM &"", 0 /* attdialect */, 1835019 /* regdef-ec:VGPR_32 */, def undef early-clobber %2.sub1, 1835018 /* regdef:VGPR_32 */, def undef %2.sub0 + ; CHECK-NEXT: INLINEASM &"", 0 /* attdialect */, 1245195 /* regdef-ec:VGPR_32 */, def undef early-clobber %2.sub1, 1245194 /* regdef:VGPR_32 */, def undef %2.sub0 ; CHECK-NEXT: FLAT_STORE_DWORDX2 $vgpr0_vgpr1, %2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s64)) ; CHECK-NEXT: S_ENDPGM 0 - INLINEASM &"", 0 /* attdialect */, 1835019 /* regdef-ec:VGPR_32 */, def early-clobber %1:vgpr_32, 1835018 /* regdef:VGPR_32 */, def %0:vgpr_32 + INLINEASM &"", 0 /* attdialect */, 1245195 /* regdef-ec:VGPR_32 */, def early-clobber %1:vgpr_32, 1245194 /* regdef:VGPR_32 */, def %0:vgpr_32 undef %2.sub0:vreg_64 = COPY killed %0 %2.sub1:vreg_64 = COPY killed %1 FLAT_STORE_DWORDX2 killed $vgpr0_vgpr1, killed %2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s64)) @@ -62,10 +62,10 @@ body: | ; CHECK-LABEL: name: foo3 ; CHECK: liveins: $vgpr0_vgpr1 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: INLINEASM &"", 0 /* attdialect */, 1835018 /* regdef:VGPR_32 */, def undef %2.sub0, 1835019 /* regdef-ec:VGPR_32 */, def undef early-clobber %2.sub1 + ; CHECK-NEXT: INLINEASM &"", 0 /* attdialect */, 1245194 /* regdef:VGPR_32 */, def undef %2.sub0, 1245195 /* regdef-ec:VGPR_32 */, def undef early-clobber %2.sub1 ; CHECK-NEXT: FLAT_STORE_DWORDX2 $vgpr0_vgpr1, %2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s64)) ; CHECK-NEXT: S_ENDPGM 0 - INLINEASM &"", 0 /* attdialect */, 1835018 /* regdef:VGPR_32 */, def %1:vgpr_32, 1835019 /* regdef-ec:VGPR_32 */, def early-clobber %0:vgpr_32 + INLINEASM &"", 0 /* attdialect */, 1245194 /* regdef:VGPR_32 */, def %1:vgpr_32, 1245195 /* regdef-ec:VGPR_32 */, def early-clobber %0:vgpr_32 undef %2.sub0:vreg_64 = COPY killed %1 %2.sub1:vreg_64 = COPY killed %0 FLAT_STORE_DWORDX2 killed $vgpr0_vgpr1, killed %2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s64)) @@ -83,10 +83,10 @@ body: | ; CHECK-LABEL: name: foo4 ; CHECK: liveins: $vgpr0_vgpr1 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: 
INLINEASM &"", 0 /* attdialect */, 1835019 /* regdef-ec:VGPR_32 */, def undef early-clobber %2.sub1, 1835018 /* regdef:VGPR_32 */, def undef %2.sub0 + ; CHECK-NEXT: INLINEASM &"", 0 /* attdialect */, 1245195 /* regdef-ec:VGPR_32 */, def undef early-clobber %2.sub1, 1245194 /* regdef:VGPR_32 */, def undef %2.sub0 ; CHECK-NEXT: FLAT_STORE_DWORDX2 $vgpr0_vgpr1, %2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s64)) ; CHECK-NEXT: S_ENDPGM 0 - INLINEASM &"", 0 /* attdialect */, 1835019 /* regdef-ec:VGPR_32 */, def early-clobber %0:vgpr_32, 1835018 /* regdef:VGPR_32 */, def %1:vgpr_32 + INLINEASM &"", 0 /* attdialect */, 1245195 /* regdef-ec:VGPR_32 */, def early-clobber %0:vgpr_32, 1245194 /* regdef:VGPR_32 */, def %1:vgpr_32 undef %2.sub0:vreg_64 = COPY killed %1 %2.sub1:vreg_64 = COPY killed %0 FLAT_STORE_DWORDX2 killed $vgpr0_vgpr1, killed %2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s64)) diff --git a/llvm/test/CodeGen/AMDGPU/dst-sel-hazard.mir b/llvm/test/CodeGen/AMDGPU/dst-sel-hazard.mir index 4dfdb56a69ff3..98472552d2bf1 100644 --- a/llvm/test/CodeGen/AMDGPU/dst-sel-hazard.mir +++ b/llvm/test/CodeGen/AMDGPU/dst-sel-hazard.mir @@ -370,7 +370,7 @@ body: | ; HAZARD-LABEL: name: inline_sdwa_hazard ; HAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode ; HAZARD-NEXT: {{ $}} - ; HAZARD-NEXT: INLINEASM &"v_or_b32 $0, 0, $1", 0 /* attdialect */, 1835018 /* regdef:VGPR_32 */, def $vgpr0, 1835017 /* reguse:VGPR_32 */, $vgpr1 + ; HAZARD-NEXT: INLINEASM &"v_or_b32 $0, 0, $1", 0 /* attdialect */, 1245194 /* regdef:VGPR_32 */, def $vgpr0, 1245193 /* reguse:VGPR_32 */, $vgpr1 ; HAZARD-NEXT: S_NOP 0 ; HAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec, implicit killed $vgpr0(tied-def 0) ; HAZARD-NEXT: S_ENDPGM 0 @@ -378,10 +378,10 @@ body: | ; NOHAZARD-LABEL: name: inline_sdwa_hazard ; NOHAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode ; NOHAZARD-NEXT: {{ $}} - ; NOHAZARD-NEXT: INLINEASM &"v_or_b32 $0, 0, $1", 0 /* attdialect */, 1835018 /* regdef:VGPR_32 */, def $vgpr0, 1835017 /* reguse:VGPR_32 */, $vgpr1 + ; NOHAZARD-NEXT: INLINEASM &"v_or_b32 $0, 0, $1", 0 /* attdialect */, 1245194 /* regdef:VGPR_32 */, def $vgpr0, 1245193 /* reguse:VGPR_32 */, $vgpr1 ; NOHAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec, implicit killed $vgpr0(tied-def 0) ; NOHAZARD-NEXT: S_ENDPGM 0 - INLINEASM &"v_or_b32 $0, 0, $1", 0 /* attdialect */, 1835018 /* regdef:VGPR_32 */, def $vgpr0, 1835017 /* reguse:VGPR_32 */, $vgpr1 + INLINEASM &"v_or_b32 $0, 0, $1", 0 /* attdialect */, 1245194 /* regdef:VGPR_32 */, def $vgpr0, 1245193 /* reguse:VGPR_32 */, $vgpr1 renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec, implicit killed $vgpr0(tied-def 0) S_ENDPGM 0 ... 
@@ -397,17 +397,17 @@ body: | ; HAZARD-NEXT: {{ $}} ; HAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec, implicit killed $vgpr0(tied-def 0) ; HAZARD-NEXT: S_NOP 0 - ; HAZARD-NEXT: INLINEASM &"v_or_b32 $0, 0, $1", 0 /* attdialect */, 1835018 /* regdef:VGPR_32 */, def $vgpr0, 1835017 /* reguse:VGPR_32 */, $vgpr1 + ; HAZARD-NEXT: INLINEASM &"v_or_b32 $0, 0, $1", 0 /* attdialect */, 1245194 /* regdef:VGPR_32 */, def $vgpr0, 1245193 /* reguse:VGPR_32 */, $vgpr1 ; HAZARD-NEXT: S_ENDPGM 0 ; ; NOHAZARD-LABEL: name: sdwa_inline_hazard ; NOHAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode ; NOHAZARD-NEXT: {{ $}} ; NOHAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec, implicit killed $vgpr0(tied-def 0) - ; NOHAZARD-NEXT: INLINEASM &"v_or_b32 $0, 0, $1", 0 /* attdialect */, 1835018 /* regdef:VGPR_32 */, def $vgpr0, 1835017 /* reguse:VGPR_32 */, $vgpr1 + ; NOHAZARD-NEXT: INLINEASM &"v_or_b32 $0, 0, $1", 0 /* attdialect */, 1245194 /* regdef:VGPR_32 */, def $vgpr0, 1245193 /* reguse:VGPR_32 */, $vgpr1 ; NOHAZARD-NEXT: S_ENDPGM 0 renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec, implicit killed $vgpr0(tied-def 0) - INLINEASM &"v_or_b32 $0, 0, $1", 0 /* attdialect */, 1835018 /* regdef:VGPR_32 */, def $vgpr0, 1835017 /* reguse:VGPR_32 */, $vgpr1 + INLINEASM &"v_or_b32 $0, 0, $1", 0 /* attdialect */, 1245194 /* regdef:VGPR_32 */, def $vgpr0, 1245193 /* reguse:VGPR_32 */, $vgpr1 S_ENDPGM 0 ... @@ -421,19 +421,19 @@ body: | ; HAZARD-LABEL: name: inline_inline_hazard ; HAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode ; HAZARD-NEXT: {{ $}} - ; HAZARD-NEXT: INLINEASM &"v_or_b32 $0, 0, $1", 0 /* attdialect */, 1835018 /* regdef:VGPR_32 */, def $vgpr0, 1835017 /* reguse:VGPR_32 */, $vgpr1 + ; HAZARD-NEXT: INLINEASM &"v_or_b32 $0, 0, $1", 0 /* attdialect */, 1245194 /* regdef:VGPR_32 */, def $vgpr0, 1245193 /* reguse:VGPR_32 */, $vgpr1 ; HAZARD-NEXT: S_NOP 0 - ; HAZARD-NEXT: INLINEASM &"v_or_b32 $0, 0, $1", 0 /* attdialect */, 1835018 /* regdef:VGPR_32 */, def $vgpr0, 1835017 /* reguse:VGPR_32 */, $vgpr1 + ; HAZARD-NEXT: INLINEASM &"v_or_b32 $0, 0, $1", 0 /* attdialect */, 1245194 /* regdef:VGPR_32 */, def $vgpr0, 1245193 /* reguse:VGPR_32 */, $vgpr1 ; HAZARD-NEXT: S_ENDPGM 0 ; ; NOHAZARD-LABEL: name: inline_inline_hazard ; NOHAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode ; NOHAZARD-NEXT: {{ $}} - ; NOHAZARD-NEXT: INLINEASM &"v_or_b32 $0, 0, $1", 0 /* attdialect */, 1835018 /* regdef:VGPR_32 */, def $vgpr0, 1835017 /* reguse:VGPR_32 */, $vgpr1 - ; NOHAZARD-NEXT: INLINEASM &"v_or_b32 $0, 0, $1", 0 /* attdialect */, 1835018 /* regdef:VGPR_32 */, def $vgpr0, 1835017 /* reguse:VGPR_32 */, $vgpr1 + ; NOHAZARD-NEXT: INLINEASM &"v_or_b32 $0, 0, $1", 0 /* attdialect */, 1245194 /* regdef:VGPR_32 */, def $vgpr0, 1245193 /* reguse:VGPR_32 */, $vgpr1 + ; NOHAZARD-NEXT: INLINEASM &"v_or_b32 $0, 0, $1", 0 /* attdialect */, 1245194 /* regdef:VGPR_32 */, def $vgpr0, 1245193 /* reguse:VGPR_32 */, $vgpr1 ; NOHAZARD-NEXT: S_ENDPGM 0 - INLINEASM &"v_or_b32 $0, 0, $1", 0 /* attdialect */, 1835018 /* regdef:VGPR_32 */, def $vgpr0, 1835017 /* reguse:VGPR_32 */, $vgpr1 - INLINEASM &"v_or_b32 $0, 0, $1", 0 /* attdialect */, 1835018 /* regdef:VGPR_32 */, def $vgpr0, 1835017 /* reguse:VGPR_32 */, $vgpr1 + INLINEASM &"v_or_b32 $0, 0, $1", 0 /* attdialect */, 1245194 /* regdef:VGPR_32 */, def $vgpr0, 1245193 /* reguse:VGPR_32 */, $vgpr1 + 
INLINEASM &"v_or_b32 $0, 0, $1", 0 /* attdialect */, 1245194 /* regdef:VGPR_32 */, def $vgpr0, 1245193 /* reguse:VGPR_32 */, $vgpr1 S_ENDPGM 0 ... diff --git a/llvm/test/CodeGen/AMDGPU/hazards-gfx950.mir b/llvm/test/CodeGen/AMDGPU/hazards-gfx950.mir index 6b9b77c228350..437b4e8b9b493 100644 --- a/llvm/test/CodeGen/AMDGPU/hazards-gfx950.mir +++ b/llvm/test/CodeGen/AMDGPU/hazards-gfx950.mir @@ -1112,11 +1112,11 @@ body: | ; GCN-NEXT: S_WAITCNT 0 ; GCN-NEXT: renamable $vgpr2 = V_CVT_SCALEF32_PK_FP4_F16_e64 8, killed $vgpr0, 0, killed $vgpr1, 4, killed $vgpr2, 0, implicit $mode, implicit $exec ; GCN-NEXT: S_NOP 0 - ; GCN-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, killed renamable $vgpr2 + ; GCN-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, killed renamable $vgpr2 ; GCN-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31 S_WAITCNT 0 renamable $vgpr2 = V_CVT_SCALEF32_PK_FP4_F16_e64 8, killed $vgpr0, 0, killed $vgpr1, 4, killed $vgpr2, 0, implicit $mode, implicit $exec - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, killed renamable $vgpr2 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, killed renamable $vgpr2 S_SETPC_B64_return undef $sgpr30_sgpr31 ... diff --git a/llvm/test/CodeGen/AMDGPU/inflate-reg-class-vgpr-mfma-to-av-with-load-source.mir b/llvm/test/CodeGen/AMDGPU/inflate-reg-class-vgpr-mfma-to-av-with-load-source.mir index 1445f6b7b58be..382a8d38fd652 100644 --- a/llvm/test/CodeGen/AMDGPU/inflate-reg-class-vgpr-mfma-to-av-with-load-source.mir +++ b/llvm/test/CodeGen/AMDGPU/inflate-reg-class-vgpr-mfma-to-av-with-load-source.mir @@ -486,7 +486,7 @@ body: | ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55 ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 ; CHECK-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = COPY killed renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; CHECK-NEXT: INLINEASM &"; use $0 ", 1 /* sideeffect attdialect */, 39911433 /* reguse:VReg_512_Align2 */, killed renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; CHECK-NEXT: INLINEASM &"; use $0 ", 1 /* sideeffect attdialect */, 39321609 /* reguse:VReg_512_Align2 */, killed renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; CHECK-NEXT: S_ENDPGM 0 bb.0: S_NOP 0, implicit-def $agpr0 @@ -516,7 +516,7 @@ body: | S_NOP 0, implicit-def $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47 S_NOP 0, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55 S_NOP 0, implicit-def $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 - INLINEASM &"; use $0 ", 1 /* sideeffect attdialect */, 39911433 /* reguse:VReg_512_Align2 */, %0:vreg_512_align2 + INLINEASM &"; use $0 ", 1 /* sideeffect attdialect */, 39321609 /* reguse:VReg_512_Align2 */, %0:vreg_512_align2 S_ENDPGM 0 ... 
@@ -1368,7 +1368,7 @@ body: | ; CHECK-NEXT: renamable $vgpr0_vgpr1 = GLOBAL_LOAD_DWORDX2 undef renamable $vgpr0_vgpr1, 0, 0, implicit $exec :: (load (s64), addrspace 1) ; CHECK-NEXT: early-clobber renamable $vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33 = V_MFMA_F32_32X32X8F16_vgprcd_e64 $vgpr16_vgpr17, $vgpr16_vgpr17, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $mode, implicit $exec ; CHECK-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = V_MFMA_F32_32X32X8F16_vgprcd_e64 $vgpr16_vgpr17, $vgpr16_vgpr17, $vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33, 0, 0, 0, implicit $mode, implicit $exec - ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 39911433 /* reguse:VReg_512_Align2 */, killed renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 39321609 /* reguse:VReg_512_Align2 */, killed renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.1, implicit $vcc ; CHECK-NEXT: S_BRANCH %bb.2 ; CHECK-NEXT: {{ $}} @@ -1408,7 +1408,7 @@ body: | undef %2.sub0_sub1:vreg_512_align2 = GLOBAL_LOAD_DWORDX2 undef %3:vreg_64_align2, 0, 0, implicit $exec :: (load (s64), addrspace 1) early-clobber %0:vreg_512_align2 = V_MFMA_F32_32X32X8F16_vgprcd_e64 %1, %1, %2, 0, 0, 0, implicit $mode, implicit $exec early-clobber %4:vreg_512_align2 = V_MFMA_F32_32X32X8F16_vgprcd_e64 %1, %1, %0, 0, 0, 0, implicit $mode, implicit $exec - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 39911433 /* reguse:VReg_512_Align2 */, %4 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 39321609 /* reguse:VReg_512_Align2 */, %4 S_CBRANCH_VCCNZ %bb.1, implicit $vcc S_BRANCH %bb.2 @@ -1726,7 +1726,7 @@ body: | ; CHECK-NEXT: renamable $vgpr0_vgpr1 = GLOBAL_LOAD_DWORDX2 undef renamable $vgpr0_vgpr1, 0, 0, implicit $exec :: (load (s64), addrspace 1) ; CHECK-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 $vgpr16_vgpr17, $vgpr16_vgpr17, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $mode, implicit $exec ; CHECK-NEXT: early-clobber renamable $vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33 = V_MFMA_F32_32X32X8F16_vgprcd_e64 $vgpr16_vgpr17, $vgpr16_vgpr17, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $mode, implicit $exec - ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 39911433 /* reguse:VReg_512_Align2 */, renamable $vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33 + ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 39321609 /* reguse:VReg_512_Align2 */, renamable $vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33 ; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.1, implicit $vcc ; CHECK-NEXT: S_BRANCH %bb.2 ; CHECK-NEXT: {{ $}} @@ 
-1763,7 +1763,7 @@ body: | undef %0.sub0_sub1:vreg_512_align2 = GLOBAL_LOAD_DWORDX2 undef %3:vreg_64_align2, 0, 0, implicit $exec :: (load (s64), addrspace 1) %0:vreg_512_align2 = V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1, %1, %0, 0, 0, 0, implicit $mode, implicit $exec %4:vreg_512_align2 = V_MFMA_F32_32X32X8F16_vgprcd_e64 %1, %1, %0, 0, 0, 0, implicit $mode, implicit $exec - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 39911433 /* reguse:VReg_512_Align2 */, %4 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 39321609 /* reguse:VReg_512_Align2 */, %4 S_CBRANCH_VCCNZ %bb.1, implicit $vcc S_BRANCH %bb.2 diff --git a/llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll b/llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll index 82b9458d09c80..9e1d59064cb5e 100644 --- a/llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll +++ b/llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll @@ -8,16 +8,16 @@ define amdgpu_kernel void @s_input_output_i128() { ; GFX908-LABEL: name: s_input_output_i128 ; GFX908: bb.0 (%ir-block.0): - ; GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 10354698 /* regdef:SGPR_128 */, def %13 + ; GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 9764874 /* regdef:SGPR_128 */, def %13 ; GFX908-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY %13 - ; GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 10354697 /* reguse:SGPR_128 */, [[COPY]] + ; GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 9764873 /* reguse:SGPR_128 */, [[COPY]] ; GFX908-NEXT: S_ENDPGM 0 ; ; GFX90A-LABEL: name: s_input_output_i128 ; GFX90A: bb.0 (%ir-block.0): - ; GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 10354698 /* regdef:SGPR_128 */, def %11 + ; GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 9764874 /* regdef:SGPR_128 */, def %11 ; GFX90A-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY %11 - ; GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 10354697 /* reguse:SGPR_128 */, [[COPY]] + ; GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 9764873 /* reguse:SGPR_128 */, [[COPY]] ; GFX90A-NEXT: S_ENDPGM 0 %val = tail call i128 asm sideeffect "; def $0", "=s"() call void asm sideeffect "; use $0", "s"(i128 %val) @@ -27,16 +27,16 @@ define amdgpu_kernel void @s_input_output_i128() { define amdgpu_kernel void @v_input_output_i128() { ; GFX908-LABEL: name: v_input_output_i128 ; GFX908: bb.0 (%ir-block.0): - ; GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7602186 /* regdef:VReg_128 */, def %13 + ; GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7012362 /* regdef:VReg_128 */, def %13 ; GFX908-NEXT: [[COPY:%[0-9]+]]:vreg_128 = COPY %13 - ; GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 7602185 /* reguse:VReg_128 */, [[COPY]] + ; GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 7012361 /* reguse:VReg_128 */, [[COPY]] ; GFX908-NEXT: S_ENDPGM 0 ; ; GFX90A-LABEL: name: v_input_output_i128 ; GFX90A: bb.0 (%ir-block.0): - ; GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7929866 /* regdef:VReg_128_Align2 */, def %11 + ; GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7340042 /* regdef:VReg_128_Align2 */, def %11 ; GFX90A-NEXT: [[COPY:%[0-9]+]]:vreg_128_align2 = COPY %11 - ; GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 7929865 /* reguse:VReg_128_Align2 */, [[COPY]] + ; GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 7340041 /* reguse:VReg_128_Align2 */, [[COPY]] ; 
GFX90A-NEXT: S_ENDPGM 0 %val = tail call i128 asm sideeffect "; def $0", "=v"() call void asm sideeffect "; use $0", "v"(i128 %val) @@ -47,16 +47,16 @@ define amdgpu_kernel void @a_input_output_i128() { ; GFX908-LABEL: name: a_input_output_i128 ; GFX908: bb.0 (%ir-block.0): - ; GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 8519690 /* regdef:AReg_128 */, def %13 + ; GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7929866 /* regdef:AReg_128 */, def %13 ; GFX908-NEXT: [[COPY:%[0-9]+]]:areg_128 = COPY %13 - ; GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8519689 /* reguse:AReg_128 */, [[COPY]] + ; GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 7929865 /* reguse:AReg_128 */, [[COPY]] ; GFX908-NEXT: S_ENDPGM 0 ; ; GFX90A-LABEL: name: a_input_output_i128 ; GFX90A: bb.0 (%ir-block.0): - ; GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 9240586 /* regdef:AReg_128_Align2 */, def %11 + ; GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 8650762 /* regdef:AReg_128_Align2 */, def %11 ; GFX90A-NEXT: [[COPY:%[0-9]+]]:areg_128_align2 = COPY %11 - ; GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 9240585 /* reguse:AReg_128_Align2 */, [[COPY]] + ; GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8650761 /* reguse:AReg_128_Align2 */, [[COPY]] ; GFX90A-NEXT: S_ENDPGM 0 %val = call i128 asm sideeffect "; def $0", "=a"() call void asm sideeffect "; use $0", "a"(i128 %val) diff --git a/llvm/test/CodeGen/AMDGPU/local-stack-alloc-add-references.gfx10.mir b/llvm/test/CodeGen/AMDGPU/local-stack-alloc-add-references.gfx10.mir index e6a52342337f3..8ea9ec397fe06 100644 --- a/llvm/test/CodeGen/AMDGPU/local-stack-alloc-add-references.gfx10.mir +++ b/llvm/test/CodeGen/AMDGPU/local-stack-alloc-add-references.gfx10.mir @@ -18,21 +18,21 @@ body: | ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec ; GFX10-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 killed [[S_MOV_B32_]], [[V_MOV_B32_e32_]], 0, implicit $exec ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_U32_e64_]] - ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[COPY]] + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[COPY]] ; GFX10-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_ADD_U32_e64_]], 256, 0, implicit $exec - ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]] + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]] ; GFX10-NEXT: SI_RETURN ; ; GFX12-LABEL: name: local_stack_alloc__v_add_u32_e64__literal_offsets ; GFX12: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 %stack.0, 256, 0, implicit $exec - ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_]] + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_]] ; GFX12-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 %stack.0, 512, 0, implicit $exec - ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]] + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]] ; GFX12-NEXT: SI_RETURN %0:vgpr_32 = 
V_ADD_U32_e64 %stack.0, 256, 0, implicit $exec - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, %0 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, %0 %1:vgpr_32 = V_ADD_U32_e64 %stack.0, 512, 0, implicit $exec - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, %1 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, %1 SI_RETURN ... @@ -53,27 +53,27 @@ body: | ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec ; GFX10-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 killed [[S_MOV_B32_]], [[V_MOV_B32_e32_]], 0, implicit $exec ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_U32_e64_]] - ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[COPY]] + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[COPY]] ; GFX10-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 256, [[V_ADD_U32_e64_]], 0, implicit $exec - ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]] + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]] ; GFX10-NEXT: [[V_ADD_U32_e64_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_ADD_U32_e64_]], -156, 0, implicit $exec - ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_2]] + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_2]] ; GFX10-NEXT: SI_RETURN ; ; GFX12-LABEL: name: local_stack_alloc__v_add_u32_e64__literal_offsets_commute ; GFX12: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 256, %stack.0, 0, implicit $exec - ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_]] + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_]] ; GFX12-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 512, %stack.0, 0, implicit $exec - ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]] + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]] ; GFX12-NEXT: [[V_ADD_U32_e64_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 %stack.0, 100, 0, implicit $exec - ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_2]] + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_2]] ; GFX12-NEXT: SI_RETURN %0:vgpr_32 = V_ADD_U32_e64 256, %stack.0, 0, implicit $exec - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, %0 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, %0 %1:vgpr_32 = V_ADD_U32_e64 512, %stack.0, 0, implicit $exec - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, %1 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, %1 %2:vgpr_32 = V_ADD_U32_e64 %stack.0, 100, 0, implicit $exec - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, %2 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, %2 
SI_RETURN ... diff --git a/llvm/test/CodeGen/AMDGPU/local-stack-alloc-add-references.gfx8.mir b/llvm/test/CodeGen/AMDGPU/local-stack-alloc-add-references.gfx8.mir index 98b7f4a5aa1c5..71c47c80ae357 100644 --- a/llvm/test/CodeGen/AMDGPU/local-stack-alloc-add-references.gfx8.mir +++ b/llvm/test/CodeGen/AMDGPU/local-stack-alloc-add-references.gfx8.mir @@ -21,9 +21,9 @@ body: | ; GFX803-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec ; GFX803-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 killed [[S_MOV_B32_]], [[V_MOV_B32_e32_]], 0, implicit $exec ; GFX803-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_CO_U32_e64_]] - ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[COPY]] + ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[COPY]] ; GFX803-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 256, [[V_ADD_CO_U32_e64_]], implicit-def dead $vcc, implicit $exec - ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] + ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] ; GFX803-NEXT: SI_RETURN ; ; GFX900-LABEL: name: local_stack_alloc__v_add_co_u32_e32__literal_offsets @@ -31,9 +31,9 @@ body: | ; GFX900-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec ; GFX900-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 killed [[S_MOV_B32_]], [[V_MOV_B32_e32_]], 0, implicit $exec ; GFX900-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_U32_e64_]] - ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[COPY]] + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[COPY]] ; GFX900-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 256, [[V_ADD_U32_e64_]], implicit-def dead $vcc, implicit $exec - ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] ; GFX900-NEXT: SI_RETURN ; ; GFX942-LABEL: name: local_stack_alloc__v_add_co_u32_e32__literal_offsets @@ -41,10 +41,10 @@ body: | ; GFX942-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 %stack.0 ; GFX942-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32_xexec_hi = S_ADD_I32 killed [[S_MOV_B32_]], [[S_MOV_B32_1]], implicit-def dead $scc ; GFX942-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[S_ADD_I32_]] - ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[COPY]] + ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[COPY]] ; GFX942-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[S_ADD_I32_]] ; GFX942-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 256, [[COPY1]], implicit-def dead $vcc, implicit $exec - ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] + ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] ; GFX942-NEXT: SI_RETURN ; ; GFX10-LABEL: name: local_stack_alloc__v_add_co_u32_e32__literal_offsets @@ -52,9 +52,9 @@ body: | ; GFX10-NEXT: 
[[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec ; GFX10-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 killed [[S_MOV_B32_]], [[V_MOV_B32_e32_]], 0, implicit $exec ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_U32_e64_]] - ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[COPY]] + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[COPY]] ; GFX10-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 256, [[V_ADD_U32_e64_]], implicit-def dead $vcc, implicit $exec - ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] ; GFX10-NEXT: SI_RETURN ; ; GFX12-LABEL: name: local_stack_alloc__v_add_co_u32_e32__literal_offsets @@ -62,15 +62,15 @@ body: | ; GFX12-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 %stack.0 ; GFX12-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32_xexec_hi = S_ADD_I32 killed [[S_MOV_B32_]], [[S_MOV_B32_1]], implicit-def dead $scc ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[S_ADD_I32_]] - ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[COPY]] + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[COPY]] ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[S_ADD_I32_]] ; GFX12-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 256, [[COPY1]], implicit-def dead $vcc, implicit $exec - ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] ; GFX12-NEXT: SI_RETURN %0:vgpr_32 = V_ADD_CO_U32_e32 256, %stack.0, implicit-def dead $vcc, implicit $exec - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, %0 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, %0 %1:vgpr_32 = V_ADD_CO_U32_e32 512, %stack.0, implicit-def dead $vcc, implicit $exec - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, %1 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, %1 SI_RETURN ... 
@@ -88,42 +88,42 @@ body: | bb.0: ; GFX803-LABEL: name: local_stack_alloc__v_add_co_u32_e32__literal_offsets_live_vcc ; GFX803: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 256, %stack.0, implicit-def dead $vcc, implicit $exec - ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] + ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] ; GFX803-NEXT: [[V_ADD_CO_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 512, %stack.0, implicit-def $vcc, implicit $exec - ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]], implicit $vcc + ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]], implicit $vcc ; GFX803-NEXT: SI_RETURN ; ; GFX900-LABEL: name: local_stack_alloc__v_add_co_u32_e32__literal_offsets_live_vcc ; GFX900: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 256, %stack.0, implicit-def dead $vcc, implicit $exec - ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] ; GFX900-NEXT: [[V_ADD_CO_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 512, %stack.0, implicit-def $vcc, implicit $exec - ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]], implicit $vcc + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]], implicit $vcc ; GFX900-NEXT: SI_RETURN ; ; GFX942-LABEL: name: local_stack_alloc__v_add_co_u32_e32__literal_offsets_live_vcc ; GFX942: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 256, %stack.0, implicit-def dead $vcc, implicit $exec - ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] + ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] ; GFX942-NEXT: [[V_ADD_CO_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 512, %stack.0, implicit-def $vcc, implicit $exec - ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]], implicit $vcc + ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]], implicit $vcc ; GFX942-NEXT: SI_RETURN ; ; GFX10-LABEL: name: local_stack_alloc__v_add_co_u32_e32__literal_offsets_live_vcc ; GFX10: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 256, %stack.0, implicit-def dead $vcc, implicit $exec - ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] ; GFX10-NEXT: [[V_ADD_CO_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 512, %stack.0, implicit-def $vcc, implicit $exec - ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]], implicit $vcc + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]], implicit $vcc ; GFX10-NEXT: SI_RETURN ; ; GFX12-LABEL: 
name: local_stack_alloc__v_add_co_u32_e32__literal_offsets_live_vcc ; GFX12: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 256, %stack.0, implicit-def dead $vcc, implicit $exec - ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] ; GFX12-NEXT: [[V_ADD_CO_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 512, %stack.0, implicit-def $vcc, implicit $exec - ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]], implicit $vcc + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]], implicit $vcc ; GFX12-NEXT: SI_RETURN %0:vgpr_32 = V_ADD_CO_U32_e32 256, %stack.0, implicit-def dead $vcc, implicit $exec - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, %0 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, %0 %1:vgpr_32 = V_ADD_CO_U32_e32 512, %stack.0, implicit-def $vcc, implicit $exec - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, %1, implicit $vcc + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, %1, implicit $vcc SI_RETURN ... @@ -144,9 +144,9 @@ body: | ; GFX803-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec ; GFX803-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 killed [[S_MOV_B32_]], [[V_MOV_B32_e32_]], 0, implicit $exec ; GFX803-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_CO_U32_e64_]] - ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[COPY]] + ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[COPY]] ; GFX803-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 8, [[V_ADD_CO_U32_e64_]], implicit-def dead $vcc, implicit $exec - ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] + ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] ; GFX803-NEXT: SI_RETURN ; ; GFX900-LABEL: name: local_stack_alloc__v_add_co_u32_e32__inline_imm_offsets @@ -154,9 +154,9 @@ body: | ; GFX900-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec ; GFX900-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 killed [[S_MOV_B32_]], [[V_MOV_B32_e32_]], 0, implicit $exec ; GFX900-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_U32_e64_]] - ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[COPY]] + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[COPY]] ; GFX900-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 8, [[V_ADD_U32_e64_]], implicit-def dead $vcc, implicit $exec - ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] ; GFX900-NEXT: SI_RETURN ; ; GFX942-LABEL: name: local_stack_alloc__v_add_co_u32_e32__inline_imm_offsets @@ -164,10 +164,10 @@ body: | ; GFX942-NEXT: 
[[S_MOV_B32_1:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 %stack.0 ; GFX942-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32_xexec_hi = S_ADD_I32 killed [[S_MOV_B32_]], [[S_MOV_B32_1]], implicit-def dead $scc ; GFX942-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[S_ADD_I32_]] - ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[COPY]] + ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[COPY]] ; GFX942-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[S_ADD_I32_]] ; GFX942-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 8, [[COPY1]], implicit-def dead $vcc, implicit $exec - ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] + ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] ; GFX942-NEXT: SI_RETURN ; ; GFX10-LABEL: name: local_stack_alloc__v_add_co_u32_e32__inline_imm_offsets @@ -175,9 +175,9 @@ body: | ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec ; GFX10-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 killed [[S_MOV_B32_]], [[V_MOV_B32_e32_]], 0, implicit $exec ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_U32_e64_]] - ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[COPY]] + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[COPY]] ; GFX10-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 8, [[V_ADD_U32_e64_]], implicit-def dead $vcc, implicit $exec - ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] ; GFX10-NEXT: SI_RETURN ; ; GFX12-LABEL: name: local_stack_alloc__v_add_co_u32_e32__inline_imm_offsets @@ -185,15 +185,15 @@ body: | ; GFX12-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 %stack.0 ; GFX12-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32_xexec_hi = S_ADD_I32 killed [[S_MOV_B32_]], [[S_MOV_B32_1]], implicit-def dead $scc ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[S_ADD_I32_]] - ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[COPY]] + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[COPY]] ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[S_ADD_I32_]] ; GFX12-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 8, [[COPY1]], implicit-def dead $vcc, implicit $exec - ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] ; GFX12-NEXT: SI_RETURN %0:vgpr_32 = V_ADD_CO_U32_e32 8, %stack.0, implicit-def dead $vcc, implicit $exec - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, %0 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, %0 %1:vgpr_32 = V_ADD_CO_U32_e32 16, %stack.0, implicit-def dead $vcc, implicit $exec - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, %1 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, %1 SI_RETURN ... 
@@ -214,9 +214,9 @@ body: | ; GFX803-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec ; GFX803-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 killed [[S_MOV_B32_]], [[V_MOV_B32_e32_]], 0, implicit $exec ; GFX803-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_CO_U32_e64_]] - ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[COPY]] + ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[COPY]] ; GFX803-NEXT: [[V_ADD_CO_U32_e64_2:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_3:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 8, [[V_ADD_CO_U32_e64_]], 0, implicit $exec - ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_2]] + ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_2]] ; GFX803-NEXT: SI_RETURN ; ; GFX900-LABEL: name: local_stack_alloc__v_add_co_u32_e64__inline_imm_offsets @@ -224,9 +224,9 @@ body: | ; GFX900-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec ; GFX900-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 killed [[S_MOV_B32_]], [[V_MOV_B32_e32_]], 0, implicit $exec ; GFX900-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_U32_e64_]] - ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[COPY]] + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[COPY]] ; GFX900-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 8, [[V_ADD_U32_e64_]], 0, implicit $exec - ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]] + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]] ; GFX900-NEXT: SI_RETURN ; ; GFX942-LABEL: name: local_stack_alloc__v_add_co_u32_e64__inline_imm_offsets @@ -234,9 +234,9 @@ body: | ; GFX942-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 %stack.0 ; GFX942-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32_xexec_hi = S_ADD_I32 killed [[S_MOV_B32_]], [[S_MOV_B32_1]], implicit-def dead $scc ; GFX942-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[S_ADD_I32_]] - ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[COPY]] + ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[COPY]] ; GFX942-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 8, [[S_ADD_I32_]], 0, implicit $exec - ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]] + ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]] ; GFX942-NEXT: SI_RETURN ; ; GFX10-LABEL: name: local_stack_alloc__v_add_co_u32_e64__inline_imm_offsets @@ -244,9 +244,9 @@ body: | ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec ; GFX10-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 killed [[S_MOV_B32_]], [[V_MOV_B32_e32_]], 0, implicit $exec ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_U32_e64_]] - ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 
1835017 /* reguse:VGPR_32 */, [[COPY]] + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[COPY]] ; GFX10-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 8, [[V_ADD_U32_e64_]], 0, implicit $exec - ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]] + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]] ; GFX10-NEXT: SI_RETURN ; ; GFX12-LABEL: name: local_stack_alloc__v_add_co_u32_e64__inline_imm_offsets @@ -254,14 +254,14 @@ body: | ; GFX12-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 %stack.0 ; GFX12-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32_xexec_hi = S_ADD_I32 killed [[S_MOV_B32_]], [[S_MOV_B32_1]], implicit-def dead $scc ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[S_ADD_I32_]] - ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[COPY]] + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[COPY]] ; GFX12-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 8, [[S_ADD_I32_]], 0, implicit $exec - ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]] + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]] ; GFX12-NEXT: SI_RETURN %0:vgpr_32, dead %2:sreg_64_xexec = V_ADD_CO_U32_e64 %stack.0, 8, 0, implicit $exec - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, %0 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, %0 %1:vgpr_32, dead %3:sreg_64_xexec = V_ADD_CO_U32_e64 16, %stack.0, 0, implicit $exec - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, %1 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, %1 SI_RETURN ... 
@@ -279,42 +279,42 @@ body: | bb.0: ; GFX803-LABEL: name: local_stack_alloc__v_add_co_u32_e64__inline_imm_offsets_live_vcc ; GFX803: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 %stack.0, 8, 0, implicit $exec - ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]] + ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]] ; GFX803-NEXT: [[V_ADD_CO_U32_e64_2:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_3:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 16, %stack.0, 0, implicit $exec - ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_2]] + ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_2]] ; GFX803-NEXT: SI_RETURN implicit [[V_ADD_CO_U32_e64_1]] ; ; GFX900-LABEL: name: local_stack_alloc__v_add_co_u32_e64__inline_imm_offsets_live_vcc ; GFX900: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 %stack.0, 8, 0, implicit $exec - ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]] + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]] ; GFX900-NEXT: [[V_ADD_CO_U32_e64_2:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_3:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 16, %stack.0, 0, implicit $exec - ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_2]] + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_2]] ; GFX900-NEXT: SI_RETURN implicit [[V_ADD_CO_U32_e64_1]] ; ; GFX942-LABEL: name: local_stack_alloc__v_add_co_u32_e64__inline_imm_offsets_live_vcc ; GFX942: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 %stack.0, 8, 0, implicit $exec - ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]] + ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]] ; GFX942-NEXT: [[V_ADD_CO_U32_e64_2:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_3:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 16, %stack.0, 0, implicit $exec - ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_2]] + ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_2]] ; GFX942-NEXT: SI_RETURN implicit [[V_ADD_CO_U32_e64_1]] ; ; GFX10-LABEL: name: local_stack_alloc__v_add_co_u32_e64__inline_imm_offsets_live_vcc ; GFX10: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 %stack.0, 8, 0, implicit $exec - ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]] + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]] ; GFX10-NEXT: [[V_ADD_CO_U32_e64_2:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_3:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 16, %stack.0, 0, implicit $exec - ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* 
reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_2]] + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_2]] ; GFX10-NEXT: SI_RETURN implicit [[V_ADD_CO_U32_e64_1]] ; ; GFX12-LABEL: name: local_stack_alloc__v_add_co_u32_e64__inline_imm_offsets_live_vcc ; GFX12: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 %stack.0, 8, 0, implicit $exec - ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]] + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]] ; GFX12-NEXT: [[V_ADD_CO_U32_e64_2:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_3:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 16, %stack.0, 0, implicit $exec - ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_2]] + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_2]] ; GFX12-NEXT: SI_RETURN implicit [[V_ADD_CO_U32_e64_1]] %0:vgpr_32, %2:sreg_64_xexec = V_ADD_CO_U32_e64 %stack.0, 8, 0, implicit $exec - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, %0 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, %0 %1:vgpr_32, %3:sreg_64_xexec = V_ADD_CO_U32_e64 16, %stack.0, 0, implicit $exec - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, %1 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, %1 SI_RETURN implicit %2 ... @@ -332,42 +332,42 @@ body: | bb.0: ; GFX803-LABEL: name: local_stack_alloc__s_add_i32__literal_offsets ; GFX803: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 256, %stack.0, implicit-def dead $scc - ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_]] + ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_]] ; GFX803-NEXT: [[S_ADD_I32_1:%[0-9]+]]:sreg_32 = S_ADD_I32 512, %stack.0, implicit-def dead $scc - ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_1]] + ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_1]] ; GFX803-NEXT: SI_RETURN ; ; GFX900-LABEL: name: local_stack_alloc__s_add_i32__literal_offsets ; GFX900: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 256, %stack.0, implicit-def dead $scc - ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_]] + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_]] ; GFX900-NEXT: [[S_ADD_I32_1:%[0-9]+]]:sreg_32 = S_ADD_I32 512, %stack.0, implicit-def dead $scc - ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_1]] + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_1]] ; GFX900-NEXT: SI_RETURN ; ; GFX942-LABEL: name: local_stack_alloc__s_add_i32__literal_offsets ; GFX942: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 256, %stack.0, implicit-def dead $scc - ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_]] + ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect 
attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_]] ; GFX942-NEXT: [[S_ADD_I32_1:%[0-9]+]]:sreg_32 = S_ADD_I32 512, %stack.0, implicit-def dead $scc - ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_1]] + ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_1]] ; GFX942-NEXT: SI_RETURN ; ; GFX10-LABEL: name: local_stack_alloc__s_add_i32__literal_offsets ; GFX10: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 256, %stack.0, implicit-def dead $scc - ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_]] + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_]] ; GFX10-NEXT: [[S_ADD_I32_1:%[0-9]+]]:sreg_32 = S_ADD_I32 512, %stack.0, implicit-def dead $scc - ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_1]] + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_1]] ; GFX10-NEXT: SI_RETURN ; ; GFX12-LABEL: name: local_stack_alloc__s_add_i32__literal_offsets ; GFX12: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 256, %stack.0, implicit-def dead $scc - ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_]] + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_]] ; GFX12-NEXT: [[S_ADD_I32_1:%[0-9]+]]:sreg_32 = S_ADD_I32 512, %stack.0, implicit-def dead $scc - ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_1]] + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_1]] ; GFX12-NEXT: SI_RETURN %0:sreg_32 = S_ADD_I32 256, %stack.0, implicit-def dead $scc - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, %0 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, %0 %1:sreg_32 = S_ADD_I32 512, %stack.0, implicit-def dead $scc - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, %1 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, %1 SI_RETURN ... 
@@ -385,42 +385,42 @@ body: |
   bb.0:
     ; GFX803-LABEL: name: local_stack_alloc__s_add_i32__inline_imm_offsets
     ; GFX803: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 8, %stack.0, implicit-def dead $scc
-    ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_]]
+    ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_]]
     ; GFX803-NEXT: [[S_ADD_I32_1:%[0-9]+]]:sreg_32 = S_ADD_I32 16, %stack.0, implicit-def dead $scc
-    ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_1]]
+    ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_1]]
     ; GFX803-NEXT: SI_RETURN
     ;
     ; GFX900-LABEL: name: local_stack_alloc__s_add_i32__inline_imm_offsets
     ; GFX900: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 8, %stack.0, implicit-def dead $scc
-    ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_]]
+    ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_]]
     ; GFX900-NEXT: [[S_ADD_I32_1:%[0-9]+]]:sreg_32 = S_ADD_I32 16, %stack.0, implicit-def dead $scc
-    ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_1]]
+    ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_1]]
     ; GFX900-NEXT: SI_RETURN
     ;
     ; GFX942-LABEL: name: local_stack_alloc__s_add_i32__inline_imm_offsets
     ; GFX942: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 8, %stack.0, implicit-def dead $scc
-    ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_]]
+    ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_]]
     ; GFX942-NEXT: [[S_ADD_I32_1:%[0-9]+]]:sreg_32 = S_ADD_I32 16, %stack.0, implicit-def dead $scc
-    ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_1]]
+    ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_1]]
     ; GFX942-NEXT: SI_RETURN
     ;
     ; GFX10-LABEL: name: local_stack_alloc__s_add_i32__inline_imm_offsets
     ; GFX10: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 8, %stack.0, implicit-def dead $scc
-    ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_]]
+    ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_]]
     ; GFX10-NEXT: [[S_ADD_I32_1:%[0-9]+]]:sreg_32 = S_ADD_I32 16, %stack.0, implicit-def dead $scc
-    ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_1]]
+    ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_1]]
     ; GFX10-NEXT: SI_RETURN
     ;
     ; GFX12-LABEL: name: local_stack_alloc__s_add_i32__inline_imm_offsets
     ; GFX12: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 8, %stack.0, implicit-def dead $scc
-    ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_]]
+    ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_]]
     ; GFX12-NEXT: [[S_ADD_I32_1:%[0-9]+]]:sreg_32 = S_ADD_I32 16, %stack.0, implicit-def dead $scc
-    ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_1]]
+    ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_1]]
     ; GFX12-NEXT: SI_RETURN
     %0:sreg_32 = S_ADD_I32 8, %stack.0, implicit-def dead $scc
-    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, %0
+    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, %0
     %1:sreg_32 = S_ADD_I32 16, %stack.0, implicit-def dead $scc
-    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, %1
+    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, %1
     SI_RETURN
 ...
@@ -443,9 +443,9 @@ body: |
     ; GFX803-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr4
     ; GFX803-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr5
     ; GFX803-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY]], %stack.0, implicit-def dead $scc
-    ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_]]
+    ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_]]
     ; GFX803-NEXT: [[S_ADD_I32_1:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY1]], %stack.0, implicit-def dead $scc
-    ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_1]]
+    ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_1]]
     ; GFX803-NEXT: SI_RETURN
     ;
     ; GFX900-LABEL: name: local_stack_alloc__s_add_i32__reg_offsets
@@ -454,9 +454,9 @@ body: |
     ; GFX900-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr4
     ; GFX900-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr5
     ; GFX900-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY]], %stack.0, implicit-def dead $scc
-    ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_]]
+    ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_]]
     ; GFX900-NEXT: [[S_ADD_I32_1:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY1]], %stack.0, implicit-def dead $scc
-    ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_1]]
+    ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_1]]
     ; GFX900-NEXT: SI_RETURN
     ;
     ; GFX942-LABEL: name: local_stack_alloc__s_add_i32__reg_offsets
@@ -465,9 +465,9 @@ body: |
     ; GFX942-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr4
     ; GFX942-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr5
     ; GFX942-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY]], %stack.0, implicit-def dead $scc
-    ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_]]
+    ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_]]
     ; GFX942-NEXT: [[S_ADD_I32_1:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY1]], %stack.0, implicit-def dead $scc
-    ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_1]]
+    ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_1]]
     ; GFX942-NEXT: SI_RETURN
     ;
     ; GFX10-LABEL: name: local_stack_alloc__s_add_i32__reg_offsets
@@ -476,9 +476,9 @@ body: |
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr4
     ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr5
     ; GFX10-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY]], %stack.0, implicit-def dead $scc
-    ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_]]
+    ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_]]
     ; GFX10-NEXT: [[S_ADD_I32_1:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY1]], %stack.0, implicit-def dead $scc
-    ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_1]]
+    ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_1]]
     ; GFX10-NEXT: SI_RETURN
     ;
     ; GFX12-LABEL: name: local_stack_alloc__s_add_i32__reg_offsets
@@ -487,17 +487,17 @@ body: |
     ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr4
     ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr5
     ; GFX12-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY]], %stack.0, implicit-def dead $scc
-    ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_]]
+    ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_]]
     ; GFX12-NEXT: [[S_ADD_I32_1:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY1]], %stack.0, implicit-def dead $scc
-    ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_1]]
+    ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_1]]
     ; GFX12-NEXT: SI_RETURN
     %0:sreg_32 = COPY $sgpr4
     %1:sreg_32 = COPY $sgpr5
     %2:sreg_32 = S_ADD_I32 %0, %stack.0, implicit-def dead $scc
-    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, %2
+    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, %2
     %3:sreg_32 = S_ADD_I32 %1, %stack.0, implicit-def dead $scc
-    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, %3
+    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, %3
     SI_RETURN
 ...
@@ -520,9 +520,9 @@ body: |
     ; GFX803-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr4
     ; GFX803-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr5
     ; GFX803-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 %stack.0, [[COPY]], implicit-def dead $scc
-    ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_]]
+    ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_]]
     ; GFX803-NEXT: [[S_ADD_I32_1:%[0-9]+]]:sreg_32 = S_ADD_I32 %stack.0, [[COPY1]], implicit-def dead $scc
-    ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_1]]
+    ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_1]]
     ; GFX803-NEXT: SI_RETURN
     ;
     ; GFX900-LABEL: name: local_stack_alloc__s_add_i32__reg_offsets_commute
@@ -531,9 +531,9 @@ body: |
     ; GFX900-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr4
     ; GFX900-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr5
     ; GFX900-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 %stack.0, [[COPY]], implicit-def dead $scc
-    ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_]]
+    ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_]]
     ; GFX900-NEXT: [[S_ADD_I32_1:%[0-9]+]]:sreg_32 = S_ADD_I32 %stack.0, [[COPY1]], implicit-def dead $scc
-    ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_1]]
+    ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_1]]
     ; GFX900-NEXT: SI_RETURN
     ;
     ; GFX942-LABEL: name: local_stack_alloc__s_add_i32__reg_offsets_commute
@@ -542,9 +542,9 @@ body: |
     ; GFX942-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr4
     ; GFX942-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr5
     ; GFX942-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 %stack.0, [[COPY]], implicit-def dead $scc
-    ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_]]
+    ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_]]
     ; GFX942-NEXT: [[S_ADD_I32_1:%[0-9]+]]:sreg_32 = S_ADD_I32 %stack.0, [[COPY1]], implicit-def dead $scc
-    ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_1]]
+    ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_1]]
     ; GFX942-NEXT: SI_RETURN
     ;
     ; GFX10-LABEL: name: local_stack_alloc__s_add_i32__reg_offsets_commute
@@ -553,9 +553,9 @@ body: |
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr4
     ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr5
     ; GFX10-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 %stack.0, [[COPY]], implicit-def dead $scc
-    ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_]]
+    ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_]]
     ; GFX10-NEXT: [[S_ADD_I32_1:%[0-9]+]]:sreg_32 = S_ADD_I32 %stack.0, [[COPY1]], implicit-def dead $scc
-    ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_1]]
+    ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_1]]
     ; GFX10-NEXT: SI_RETURN
     ;
     ; GFX12-LABEL: name: local_stack_alloc__s_add_i32__reg_offsets_commute
@@ -564,17 +564,17 @@ body: |
     ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr4
     ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr5
     ; GFX12-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 %stack.0, [[COPY]], implicit-def dead $scc
-    ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_]]
+    ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_]]
     ; GFX12-NEXT: [[S_ADD_I32_1:%[0-9]+]]:sreg_32 = S_ADD_I32 %stack.0, [[COPY1]], implicit-def dead $scc
-    ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_1]]
+    ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_1]]
     ; GFX12-NEXT: SI_RETURN
     %0:sreg_32 = COPY $sgpr4
     %1:sreg_32 = COPY $sgpr5
     %2:sreg_32 = S_ADD_I32 %stack.0, %0, implicit-def dead $scc
-    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, %2
+    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, %2
     %3:sreg_32 = S_ADD_I32 %stack.0, %1, implicit-def dead $scc
-    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, %3
+    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, %3
     SI_RETURN
 ...
@@ -592,48 +592,48 @@ body: |
   bb.0:
     ; GFX803-LABEL: name: local_stack_alloc__s_add_i32__literal_offsets_live_scc
     ; GFX803: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 256, %stack.0, implicit-def $scc
-    ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_]]
+    ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_]]
     ; GFX803-NEXT: S_NOP 0, implicit $scc
     ; GFX803-NEXT: [[S_ADD_I32_1:%[0-9]+]]:sreg_32 = S_ADD_I32 512, %stack.0, implicit-def $scc
-    ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_1]]
+    ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_1]]
     ; GFX803-NEXT: SI_RETURN implicit $scc
     ;
     ; GFX900-LABEL: name: local_stack_alloc__s_add_i32__literal_offsets_live_scc
     ; GFX900: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 256, %stack.0, implicit-def $scc
-    ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_]]
+    ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_]]
     ; GFX900-NEXT: S_NOP 0, implicit $scc
     ; GFX900-NEXT: [[S_ADD_I32_1:%[0-9]+]]:sreg_32 = S_ADD_I32 512, %stack.0, implicit-def $scc
-    ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_1]]
+    ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_1]]
     ; GFX900-NEXT: SI_RETURN implicit $scc
     ;
     ; GFX942-LABEL: name: local_stack_alloc__s_add_i32__literal_offsets_live_scc
     ; GFX942: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 256, %stack.0, implicit-def $scc
-    ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_]]
+    ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_]]
     ; GFX942-NEXT: S_NOP 0, implicit $scc
     ; GFX942-NEXT: [[S_ADD_I32_1:%[0-9]+]]:sreg_32 = S_ADD_I32 512, %stack.0, implicit-def $scc
-    ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_1]]
+    ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_1]]
     ; GFX942-NEXT: SI_RETURN implicit $scc
     ;
     ; GFX10-LABEL: name: local_stack_alloc__s_add_i32__literal_offsets_live_scc
     ; GFX10: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 256, %stack.0, implicit-def $scc
-    ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_]]
+    ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_]]
     ; GFX10-NEXT: S_NOP 0, implicit $scc
     ; GFX10-NEXT: [[S_ADD_I32_1:%[0-9]+]]:sreg_32 = S_ADD_I32 512, %stack.0, implicit-def $scc
-    ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_1]]
+    ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_1]]
     ; GFX10-NEXT: SI_RETURN implicit $scc
     ;
     ; GFX12-LABEL: name: local_stack_alloc__s_add_i32__literal_offsets_live_scc
     ; GFX12: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 256, %stack.0, implicit-def $scc
-    ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_]]
+    ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_]]
     ; GFX12-NEXT: S_NOP 0, implicit $scc
     ; GFX12-NEXT: [[S_ADD_I32_1:%[0-9]+]]:sreg_32 = S_ADD_I32 512, %stack.0, implicit-def $scc
-    ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_1]]
+    ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_1]]
     ; GFX12-NEXT: SI_RETURN implicit $scc
     %0:sreg_32 = S_ADD_I32 256, %stack.0, implicit-def $scc
-    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, %0
+    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, %0
     S_NOP 0, implicit $scc
     %1:sreg_32 = S_ADD_I32 512, %stack.0, implicit-def $scc
-    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, %1
+    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, %1
     SI_RETURN implicit $scc
 ...
@@ -656,9 +656,9 @@ body: |
     ; GFX803-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
     ; GFX803-NEXT: %vgpr_offset:vgpr_32 = COPY $vgpr0
     ; GFX803-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 %vgpr_offset, [[V_MOV_B32_e32_]], implicit-def dead $vcc, implicit $exec
-    ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]]
+    ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]]
     ; GFX803-NEXT: [[V_ADD_CO_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 %vgpr_offset, [[V_MOV_B32_e32_]], implicit-def dead $vcc, implicit $exec
-    ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]]
+    ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]]
     ; GFX803-NEXT: SI_RETURN
     ;
     ; GFX900-LABEL: name: local_stack_alloc__v_add_co_u32_e32__vgpr_offsets
@@ -667,9 +667,9 @@ body: |
     ; GFX900-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
     ; GFX900-NEXT: %vgpr_offset:vgpr_32 = COPY $vgpr0
     ; GFX900-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 %vgpr_offset, [[V_MOV_B32_e32_]], implicit-def dead $vcc, implicit $exec
-    ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]]
+    ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]]
     ; GFX900-NEXT: [[V_ADD_CO_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 %vgpr_offset, [[V_MOV_B32_e32_]], implicit-def dead $vcc, implicit $exec
-    ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]]
+    ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]]
     ; GFX900-NEXT: SI_RETURN
     ;
     ; GFX942-LABEL: name: local_stack_alloc__v_add_co_u32_e32__vgpr_offsets
@@ -678,9 +678,9 @@ body: |
     ; GFX942-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xexec_hi = S_MOV_B32 %stack.0
     ; GFX942-NEXT: %vgpr_offset:vgpr_32 = COPY $vgpr0
     ; GFX942-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 [[S_MOV_B32_]], %vgpr_offset, implicit-def dead $vcc, implicit $exec
-    ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]]
+    ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]]
     ; GFX942-NEXT: [[V_ADD_CO_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 [[S_MOV_B32_]], %vgpr_offset, implicit-def dead $vcc, implicit $exec
-    ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]]
+    ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]]
     ; GFX942-NEXT: SI_RETURN
     ;
     ; GFX10-LABEL: name: local_stack_alloc__v_add_co_u32_e32__vgpr_offsets
@@ -689,9 +689,9 @@ body: |
     ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
     ; GFX10-NEXT: %vgpr_offset:vgpr_32 = COPY $vgpr0
     ; GFX10-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 %vgpr_offset, [[V_MOV_B32_e32_]], implicit-def dead $vcc, implicit $exec
-    ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]]
+    ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]]
     ; GFX10-NEXT: [[V_ADD_CO_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 %vgpr_offset, [[V_MOV_B32_e32_]], implicit-def dead $vcc, implicit $exec
-    ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]]
+    ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]]
     ; GFX10-NEXT: SI_RETURN
     ;
     ; GFX12-LABEL: name: local_stack_alloc__v_add_co_u32_e32__vgpr_offsets
@@ -700,15 +700,15 @@ body: |
     ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xexec_hi = S_MOV_B32 %stack.0
     ; GFX12-NEXT: %vgpr_offset:vgpr_32 = COPY $vgpr0
     ; GFX12-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 [[S_MOV_B32_]], %vgpr_offset, implicit-def dead $vcc, implicit $exec
-    ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]]
+    ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]]
     ; GFX12-NEXT: [[V_ADD_CO_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 [[S_MOV_B32_]], %vgpr_offset, implicit-def dead $vcc, implicit $exec
-    ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]]
+    ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]]
     ; GFX12-NEXT: SI_RETURN
     %vgpr_offset:vgpr_32 = COPY $vgpr0
     %0:vgpr_32 = V_ADD_CO_U32_e32 %vgpr_offset, %stack.0, implicit-def dead $vcc, implicit $exec
-    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, %0
+    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, %0
     %1:vgpr_32 = V_ADD_CO_U32_e32 %vgpr_offset, %stack.0, implicit-def dead $vcc, implicit $exec
-    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, %1
+    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, %1
     SI_RETURN
 ...
@@ -731,9 +731,9 @@ body: |
     ; GFX803-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
     ; GFX803-NEXT: %vgpr_offset:vgpr_32 = COPY $vgpr0
     ; GFX803-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 [[V_MOV_B32_e32_]], %vgpr_offset, implicit-def dead $vcc, implicit $exec
-    ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]]
+    ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]]
     ; GFX803-NEXT: [[V_ADD_CO_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 [[V_MOV_B32_e32_]], %vgpr_offset, implicit-def dead $vcc, implicit $exec
-    ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]]
+    ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]]
     ; GFX803-NEXT: SI_RETURN
     ;
     ; GFX900-LABEL: name: local_stack_alloc__v_add_co_u32_e32__vgpr_offsets_commute
@@ -742,9 +742,9 @@ body: |
     ; GFX900-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
     ; GFX900-NEXT: %vgpr_offset:vgpr_32 = COPY $vgpr0
     ; GFX900-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 [[V_MOV_B32_e32_]], %vgpr_offset, implicit-def dead $vcc, implicit $exec
-    ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]]
+    ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]]
     ; GFX900-NEXT: [[V_ADD_CO_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 [[V_MOV_B32_e32_]], %vgpr_offset, implicit-def dead $vcc, implicit $exec
-    ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]]
+    ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]]
     ; GFX900-NEXT: SI_RETURN
     ;
     ; GFX942-LABEL: name: local_stack_alloc__v_add_co_u32_e32__vgpr_offsets_commute
@@ -753,9 +753,9 @@ body: |
     ; GFX942-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xexec_hi = S_MOV_B32 %stack.0
     ; GFX942-NEXT: %vgpr_offset:vgpr_32 = COPY $vgpr0
     ; GFX942-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 [[S_MOV_B32_]], %vgpr_offset, implicit-def dead $vcc, implicit $exec
-    ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]]
+    ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]]
     ; GFX942-NEXT: [[V_ADD_CO_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 [[S_MOV_B32_]], %vgpr_offset, implicit-def dead $vcc, implicit $exec
-    ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]]
+    ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]]
     ; GFX942-NEXT: SI_RETURN
     ;
     ; GFX10-LABEL: name: local_stack_alloc__v_add_co_u32_e32__vgpr_offsets_commute
@@ -764,9 +764,9 @@ body: |
     ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
     ; GFX10-NEXT: %vgpr_offset:vgpr_32 = COPY $vgpr0
     ; GFX10-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 [[V_MOV_B32_e32_]], %vgpr_offset, implicit-def dead $vcc, implicit $exec
-    ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]]
+    ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]]
     ; GFX10-NEXT: [[V_ADD_CO_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 [[V_MOV_B32_e32_]], %vgpr_offset, implicit-def dead $vcc, implicit $exec
-    ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]]
+    ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]]
     ; GFX10-NEXT: SI_RETURN
     ;
     ; GFX12-LABEL: name: local_stack_alloc__v_add_co_u32_e32__vgpr_offsets_commute
@@ -775,15 +775,15 @@ body: |
     ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xexec_hi = S_MOV_B32 %stack.0
     ; GFX12-NEXT: %vgpr_offset:vgpr_32 = COPY $vgpr0
     ; GFX12-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 [[S_MOV_B32_]], %vgpr_offset, implicit-def dead $vcc, implicit $exec
-    ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]]
+    ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]]
     ; GFX12-NEXT: [[V_ADD_CO_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 [[S_MOV_B32_]], %vgpr_offset, implicit-def dead $vcc, implicit $exec
-    ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]]
+    ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]]
     ; GFX12-NEXT: SI_RETURN
     %vgpr_offset:vgpr_32 = COPY $vgpr0
     %0:vgpr_32 = V_ADD_CO_U32_e32 %stack.0, %vgpr_offset, implicit-def dead $vcc, implicit $exec
-    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, %0
+    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, %0
    %1:vgpr_32 = V_ADD_CO_U32_e32 %stack.0, %vgpr_offset, implicit-def dead $vcc, implicit $exec
-    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, %1
+    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, %1
     SI_RETURN
 ...
@@ -805,9 +805,9 @@ body: |
     ; GFX803-NEXT: {{ $}}
     ; GFX803-NEXT: %sgpr_offset:sreg_32 = COPY $sgpr8
     ; GFX803-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 %sgpr_offset, %stack.0, implicit-def dead $vcc, implicit $exec
-    ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]]
+    ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]]
     ; GFX803-NEXT: [[V_ADD_CO_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 %sgpr_offset, %stack.0, implicit-def dead $vcc, implicit $exec
-    ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]]
+    ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]]
     ; GFX803-NEXT: SI_RETURN
     ;
     ; GFX900-LABEL: name: local_stack_alloc__v_add_co_u32_e32__sgpr_offsets
@@ -815,9 +815,9 @@ body: |
     ; GFX900-NEXT: {{ $}}
     ; GFX900-NEXT: %sgpr_offset:sreg_32 = COPY $sgpr8
     ; GFX900-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 %sgpr_offset, %stack.0, implicit-def dead $vcc, implicit $exec
-    ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]]
+    ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]]
     ; GFX900-NEXT: [[V_ADD_CO_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 %sgpr_offset, %stack.0, implicit-def dead $vcc, implicit $exec
-    ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]]
+    ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]]
     ; GFX900-NEXT: SI_RETURN
     ;
     ; GFX942-LABEL: name: local_stack_alloc__v_add_co_u32_e32__sgpr_offsets
@@ -825,9 +825,9 @@ body: |
     ; GFX942-NEXT: {{ $}}
     ; GFX942-NEXT: %sgpr_offset:sreg_32 = COPY $sgpr8
     ; GFX942-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 %sgpr_offset, %stack.0, implicit-def dead $vcc, implicit $exec
-    ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]]
+    ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]]
     ; GFX942-NEXT: [[V_ADD_CO_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 %sgpr_offset, %stack.0, implicit-def dead $vcc, implicit $exec
-    ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]]
+    ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]]
     ; GFX942-NEXT: SI_RETURN
     ;
     ; GFX10-LABEL: name: local_stack_alloc__v_add_co_u32_e32__sgpr_offsets
@@ -836,9 +836,9 @@ body: |
     ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
     ; GFX10-NEXT: %sgpr_offset:sreg_32 = COPY $sgpr8
     ; GFX10-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 %sgpr_offset, [[V_MOV_B32_e32_]], implicit-def dead $vcc, implicit $exec
-    ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]]
+    ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]]
     ; GFX10-NEXT: [[V_ADD_CO_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 %sgpr_offset, [[V_MOV_B32_e32_]], implicit-def dead $vcc, implicit $exec
-    ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]]
+    ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]]
     ; GFX10-NEXT: SI_RETURN
     ;
     ; GFX12-LABEL: name: local_stack_alloc__v_add_co_u32_e32__sgpr_offsets
@@ -848,16 +848,16 @@ body: |
     ; GFX12-NEXT: %sgpr_offset:sreg_32 = COPY $sgpr8
     ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
     ; GFX12-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 %sgpr_offset, [[COPY]], implicit-def dead $vcc, implicit $exec
-    ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]]
+    ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]]
     ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
     ; GFX12-NEXT: [[V_ADD_CO_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 %sgpr_offset, [[COPY1]], implicit-def dead $vcc, implicit $exec
-    ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]]
+    ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]]
     ; GFX12-NEXT: SI_RETURN
     %sgpr_offset:sreg_32 = COPY $sgpr8
     %0:vgpr_32 = V_ADD_CO_U32_e32 %sgpr_offset, %stack.0, implicit-def dead $vcc, implicit $exec
-    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, %0
+    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, %0
     %1:vgpr_32 = V_ADD_CO_U32_e32 %sgpr_offset, %stack.0, implicit-def dead $vcc, implicit $exec
-    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, %1
+    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, %1
     SI_RETURN
 ...
@@ -880,9 +880,9 @@ body: |
     ; GFX803-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
     ; GFX803-NEXT: %sgpr_offset:sreg_32 = COPY $sgpr8
     ; GFX803-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 %sgpr_offset, [[V_MOV_B32_e32_]], 0, implicit $exec
-    ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]]
+    ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]]
     ; GFX803-NEXT: [[V_ADD_CO_U32_e64_2:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_3:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 %sgpr_offset, [[V_MOV_B32_e32_]], 0, implicit $exec
-    ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_2]]
+    ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_2]]
     ; GFX803-NEXT: SI_RETURN
     ;
     ; GFX900-LABEL: name: local_stack_alloc__v_add_co_u32_e64__sgpr_offsets
@@ -891,9 +891,9 @@ body: |
     ; GFX900-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
     ; GFX900-NEXT: %sgpr_offset:sreg_32 = COPY $sgpr8
     ; GFX900-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 %sgpr_offset, [[V_MOV_B32_e32_]], 0, implicit $exec
-    ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]]
+    ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]]
     ; GFX900-NEXT: [[V_ADD_CO_U32_e64_2:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_3:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 %sgpr_offset, [[V_MOV_B32_e32_]], 0, implicit $exec
-    ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_2]]
+    ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_2]]
     ; GFX900-NEXT: SI_RETURN
     ;
     ; GFX942-LABEL: name: local_stack_alloc__v_add_co_u32_e64__sgpr_offsets
@@ -903,10 +903,10 @@ body: |
     ; GFX942-NEXT: %sgpr_offset:sreg_32 = COPY $sgpr8
     ; GFX942-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
     ; GFX942-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 %sgpr_offset, [[COPY]], 0, implicit $exec
-    ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]]
+    ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]]
     ; GFX942-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
     ; GFX942-NEXT: [[V_ADD_CO_U32_e64_2:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_3:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 %sgpr_offset, [[COPY1]], 0, implicit $exec
-    ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_2]]
+    ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_2]]
     ; GFX942-NEXT: SI_RETURN
     ;
     ; GFX10-LABEL: name: local_stack_alloc__v_add_co_u32_e64__sgpr_offsets
@@ -915,9 +915,9 @@ body: |
     ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
     ; GFX10-NEXT: %sgpr_offset:sreg_32 = COPY $sgpr8
     ; GFX10-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 %sgpr_offset, [[V_MOV_B32_e32_]], 0, implicit $exec
-    ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]]
+    ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]]
     ; GFX10-NEXT: [[V_ADD_CO_U32_e64_2:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_3:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 %sgpr_offset, [[V_MOV_B32_e32_]], 0, implicit $exec
-    ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_2]]
+    ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_2]]
     ; GFX10-NEXT: SI_RETURN
     ;
     ; GFX12-LABEL: name: local_stack_alloc__v_add_co_u32_e64__sgpr_offsets
@@ -926,15 +926,15 @@ body: |
     ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xexec_hi = S_MOV_B32 %stack.0
     ; GFX12-NEXT: %sgpr_offset:sreg_32 = COPY $sgpr8
     ; GFX12-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 %sgpr_offset, [[S_MOV_B32_]], 0, implicit $exec
-    ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]]
+    ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]]
     ; GFX12-NEXT: [[V_ADD_CO_U32_e64_2:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_3:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 %sgpr_offset, [[S_MOV_B32_]], 0, implicit $exec
-    ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_2]]
+    ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_2]]
     ; GFX12-NEXT: SI_RETURN
     %sgpr_offset:sreg_32 = COPY $sgpr8
     %0:vgpr_32, dead %2:sreg_64_xexec = V_ADD_CO_U32_e64 %sgpr_offset, %stack.0, 0, implicit $exec
-    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, %0
+    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, %0
     %1:vgpr_32, dead %3:sreg_64_xexec = V_ADD_CO_U32_e64 %sgpr_offset, %stack.0, 0, implicit $exec
-    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, %1
+    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, %1
     SI_RETURN
 ...
@@ -957,9 +957,9 @@ body: |
     ; GFX803-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
     ; GFX803-NEXT: %sgpr_offset:sreg_32 = COPY $sgpr8
     ; GFX803-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[V_MOV_B32_e32_]], %sgpr_offset, 0, implicit $exec
-    ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]]
+    ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]]
     ; GFX803-NEXT: [[V_ADD_CO_U32_e64_2:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_3:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[V_MOV_B32_e32_]], %sgpr_offset, 0, implicit $exec
-    ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_2]]
+    ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_2]]
     ; GFX803-NEXT: SI_RETURN
     ;
     ; GFX900-LABEL: name: local_stack_alloc__v_add_co_u32_e64__sgpr_offsets_commute
@@ -968,9 +968,9 @@ body: |
     ; GFX900-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
     ; GFX900-NEXT: %sgpr_offset:sreg_32 = COPY $sgpr8
     ; GFX900-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[V_MOV_B32_e32_]], %sgpr_offset, 0, implicit $exec
-    ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]]
+    ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]]
     ; GFX900-NEXT: [[V_ADD_CO_U32_e64_2:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_3:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[V_MOV_B32_e32_]], %sgpr_offset, 0, implicit $exec
-    ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_2]]
+    ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_2]]
     ; GFX900-NEXT: SI_RETURN
     ;
     ; GFX942-LABEL: name: local_stack_alloc__v_add_co_u32_e64__sgpr_offsets_commute
@@ -980,10 +980,10 @@ body: |
     ; GFX942-NEXT: %sgpr_offset:sreg_32 = COPY $sgpr8
     ; GFX942-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY %sgpr_offset
     ; GFX942-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[S_MOV_B32_]], [[COPY]], 0, implicit $exec
-    ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]]
+    ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]]
     ; GFX942-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY %sgpr_offset
     ; GFX942-NEXT: [[V_ADD_CO_U32_e64_2:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_3:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[S_MOV_B32_]], [[COPY1]], 0, implicit $exec
-    ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_2]]
+    ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_2]]
     ; GFX942-NEXT: SI_RETURN
     ;
     ; GFX10-LABEL: name: local_stack_alloc__v_add_co_u32_e64__sgpr_offsets_commute
@@ -992,9 +992,9 @@ body: |
     ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
     ; GFX10-NEXT: %sgpr_offset:sreg_32 = COPY $sgpr8
     ; GFX10-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[V_MOV_B32_e32_]], %sgpr_offset, 0, implicit $exec
-    ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]]
+    ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]]
     ; GFX10-NEXT: [[V_ADD_CO_U32_e64_2:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_3:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[V_MOV_B32_e32_]], %sgpr_offset, 0, implicit $exec
-    ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_2]]
+    ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_2]]
     ; GFX10-NEXT: SI_RETURN
     ;
     ; GFX12-LABEL: name: local_stack_alloc__v_add_co_u32_e64__sgpr_offsets_commute
@@ -1003,15 +1003,15 @@ body: |
     ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xexec_hi = S_MOV_B32 %stack.0
     ; GFX12-NEXT: %sgpr_offset:sreg_32 = COPY $sgpr8
     ; GFX12-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[S_MOV_B32_]], %sgpr_offset, 0, implicit $exec
-    ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]]
+    ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]]
     ; GFX12-NEXT: [[V_ADD_CO_U32_e64_2:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_3:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[S_MOV_B32_]], %sgpr_offset, 0, implicit $exec
-    ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_2]]
+    ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_2]]
     ; GFX12-NEXT: SI_RETURN
     %sgpr_offset:sreg_32 = COPY $sgpr8
     %0:vgpr_32, dead %2:sreg_64_xexec = V_ADD_CO_U32_e64 %stack.0, %sgpr_offset, 0, implicit $exec
-    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, %0
+    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, %0
     %1:vgpr_32, dead %3:sreg_64_xexec = V_ADD_CO_U32_e64 %stack.0, %sgpr_offset, 0, implicit $exec
-    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, %1
+    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, %1
     SI_RETURN
 ...
diff --git a/llvm/test/CodeGen/AMDGPU/local-stack-alloc-add-references.gfx9.mir b/llvm/test/CodeGen/AMDGPU/local-stack-alloc-add-references.gfx9.mir
index 19ca463d7ecbb..f0868ffeeb7c5 100644
--- a/llvm/test/CodeGen/AMDGPU/local-stack-alloc-add-references.gfx9.mir
+++ b/llvm/test/CodeGen/AMDGPU/local-stack-alloc-add-references.gfx9.mir
@@ -20,16 +20,16 @@ body: |
     ; GFX900-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
     ; GFX900-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 killed [[S_MOV_B32_]], [[V_MOV_B32_e32_]], 0, implicit $exec
     ; GFX900-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_U32_e64_]]
-    ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[COPY]]
+    ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[COPY]]
     ; GFX900-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 256, [[V_ADD_U32_e64_]], implicit $exec
-    ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]]
+    ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]]
     ; GFX900-NEXT: SI_RETURN
     ;
     ; GFX942-LABEL: name: local_stack_alloc__v_add_u32_e32__literal_offsets
     ; GFX942: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 256, %stack.0, implicit $exec
-    ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]]
+    ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]]
     ; GFX942-NEXT: [[V_ADD_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 512, %stack.0, implicit $exec
-    ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_1]]
+    ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_1]]
     ; GFX942-NEXT: SI_RETURN
     ;
     ; GFX10-LABEL: name: local_stack_alloc__v_add_u32_e32__literal_offsets
@@ -37,21 +37,21 @@ body: |
     ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
     ; GFX10-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 killed [[S_MOV_B32_]], [[V_MOV_B32_e32_]], 0, implicit $exec
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_U32_e64_]]
-    ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[COPY]]
+    ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[COPY]]
     ; GFX10-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 256, [[V_ADD_U32_e64_]], implicit $exec
-    ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]]
+    ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]]
     ; GFX10-NEXT: SI_RETURN
     ;
     ; GFX12-LABEL: name: local_stack_alloc__v_add_u32_e32__literal_offsets
     ; GFX12: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 256, %stack.0, implicit $exec
-    ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]]
+    ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]]
     ; GFX12-NEXT: [[V_ADD_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 512, %stack.0, implicit $exec
-    ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_1]]
+    ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_1]]
     ; GFX12-NEXT: SI_RETURN
     %0:vgpr_32 = V_ADD_U32_e32 256, %stack.0, implicit $exec
-    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, %0
+    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, %0
     %1:vgpr_32 = V_ADD_U32_e32 512, %stack.0, implicit $exec
-    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, %1
+    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, %1
     SI_RETURN
 ...
@@ -72,16 +72,16 @@ body: |
     ; GFX900-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
     ; GFX900-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 killed [[S_MOV_B32_]], [[V_MOV_B32_e32_]], 0, implicit $exec
     ; GFX900-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_U32_e64_]]
-    ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[COPY]]
+    ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[COPY]]
     ; GFX900-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 8, [[V_ADD_U32_e64_]], implicit $exec
-    ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]]
+    ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]]
     ; GFX900-NEXT: SI_RETURN
     ;
     ; GFX942-LABEL: name: local_stack_alloc__v_add_u32_e32__inline_imm_offsets
     ; GFX942: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 8, %stack.0, implicit $exec
-    ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]]
+    ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]]
     ; GFX942-NEXT: [[V_ADD_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 16, %stack.0, implicit $exec
-    ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_1]]
+    ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_1]]
     ; GFX942-NEXT: SI_RETURN
     ;
     ; GFX10-LABEL: name: local_stack_alloc__v_add_u32_e32__inline_imm_offsets
@@ -89,21 +89,21 @@ body: |
     ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
     ; GFX10-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 killed [[S_MOV_B32_]], [[V_MOV_B32_e32_]], 0, implicit $exec
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_U32_e64_]]
-    ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[COPY]]
+    ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[COPY]]
     ; GFX10-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 8, [[V_ADD_U32_e64_]], implicit $exec
-    ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]]
+    ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]]
     ; GFX10-NEXT: SI_RETURN
     ;
     ; GFX12-LABEL: name: local_stack_alloc__v_add_u32_e32__inline_imm_offsets
     ; GFX12: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 8, %stack.0, implicit $exec
-    ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]]
+    ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]]
     ; GFX12-NEXT: [[V_ADD_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 16, %stack.0, implicit $exec
-    ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_1]]
+    ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_1]]
     ; GFX12-NEXT: SI_RETURN
     %0:vgpr_32 = V_ADD_U32_e32 8, %stack.0, implicit $exec
-    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, %0
+    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, %0
     %1:vgpr_32 = V_ADD_U32_e32 16, %stack.0, implicit $exec
-    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, %1
+    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, %1
     SI_RETURN
 ...
@@ -124,16 +124,16 @@ body: |
     ; GFX900-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
     ; GFX900-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 killed [[S_MOV_B32_]], [[V_MOV_B32_e32_]], 0, implicit $exec
     ; GFX900-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_U32_e64_]]
-    ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[COPY]]
+    ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[COPY]]
     ; GFX900-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 8, [[V_ADD_U32_e64_]], 0, implicit $exec
-    ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]]
+    ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]]
     ; GFX900-NEXT: SI_RETURN
     ;
     ; GFX942-LABEL: name: local_stack_alloc__v_add_u32_e64__inline_imm_offsets
     ; GFX942: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 %stack.0, 8, 0, implicit $exec
-    ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_]]
+    ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_]]
     ; GFX942-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 16, %stack.0, 0, implicit $exec
-    ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]]
+    ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]]
     ; GFX942-NEXT: SI_RETURN
     ;
     ; GFX10-LABEL: name: local_stack_alloc__v_add_u32_e64__inline_imm_offsets
@@ -141,21 +141,21 @@ body: |
     ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
     ; GFX10-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 killed [[S_MOV_B32_]], [[V_MOV_B32_e32_]], 0, implicit $exec
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_U32_e64_]]
-    ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[COPY]]
+    ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[COPY]]
     ; GFX10-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 8, [[V_ADD_U32_e64_]], 0, implicit $exec
-    ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]]
+    ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]]
     ; GFX10-NEXT: SI_RETURN
     ;
     ; GFX12-LABEL: name: local_stack_alloc__v_add_u32_e64__inline_imm_offsets
     ; GFX12: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 %stack.0, 8, 0, implicit $exec
-    ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_]]
+    ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_]]
     ; GFX12-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 16, %stack.0, 0, implicit $exec
-    ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]]
+    ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]]
     ; GFX12-NEXT: SI_RETURN
     %0:vgpr_32 = V_ADD_U32_e64 %stack.0, 8, 0, implicit $exec
-    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, %0
+    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, %0
     %1:vgpr_32 = V_ADD_U32_e64 16, %stack.0, 0, implicit $exec
-    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, %1
+    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, %1
     SI_RETURN
 ...
@@ -178,9 +178,9 @@ body: |
     ; GFX900-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
     ; GFX900-NEXT: %vgpr_offset:vgpr_32 = COPY $vgpr0
     ; GFX900-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 %vgpr_offset, [[V_MOV_B32_e32_]], implicit $exec
-    ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]]
+    ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]]
     ; GFX900-NEXT: [[V_ADD_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 %vgpr_offset, [[V_MOV_B32_e32_]], implicit $exec
-    ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_1]]
+    ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_1]]
     ; GFX900-NEXT: SI_RETURN
     ;
     ; GFX942-LABEL: name: local_stack_alloc__v_add_u32_e32__vgpr_offsets
@@ -188,9 +188,9 @@ body: |
     ; GFX942-NEXT: {{ $}}
     ; GFX942-NEXT: %vgpr_offset:vgpr_32 = COPY $vgpr0
     ; GFX942-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 %vgpr_offset, %stack.0, implicit $exec
-    ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]]
+    ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]]
     ; GFX942-NEXT: [[V_ADD_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 %vgpr_offset, %stack.0, implicit $exec
-    ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_1]]
+    ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_1]]
     ; GFX942-NEXT: SI_RETURN
     ;
     ; GFX10-LABEL: name: local_stack_alloc__v_add_u32_e32__vgpr_offsets
@@ -199,9 +199,9 @@ body: |
     ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
     ; GFX10-NEXT: %vgpr_offset:vgpr_32 = COPY $vgpr0
     ; GFX10-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 %vgpr_offset, [[V_MOV_B32_e32_]], implicit $exec
-    ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]]
+    ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]]
     ; GFX10-NEXT: [[V_ADD_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 %vgpr_offset, [[V_MOV_B32_e32_]], implicit $exec
-    ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_1]]
+    ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_1]]
     ; GFX10-NEXT: SI_RETURN
     ;
     ; GFX12-LABEL: name: local_stack_alloc__v_add_u32_e32__vgpr_offsets
@@ -209,15 +209,15 @@ body: |
     ; GFX12-NEXT: {{ $}}
     ; GFX12-NEXT: %vgpr_offset:vgpr_32 = COPY $vgpr0
     ; GFX12-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 %vgpr_offset, %stack.0, implicit $exec
-    ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]]
+    ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]]
     ; GFX12-NEXT: [[V_ADD_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 %vgpr_offset, %stack.0, implicit $exec
-    ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_1]]
+    ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_1]]
     ; GFX12-NEXT: SI_RETURN
     %vgpr_offset:vgpr_32 = COPY $vgpr0
     %0:vgpr_32 = V_ADD_U32_e32 %vgpr_offset, %stack.0, implicit $exec
-    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, %0
+    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, %0
     %1:vgpr_32 = V_ADD_U32_e32 %vgpr_offset, %stack.0, implicit $exec
-    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, %1
+    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, %1
     SI_RETURN
 ...
@@ -240,9 +240,9 @@ body: | ; GFX900-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec ; GFX900-NEXT: %vgpr_offset:vgpr_32 = COPY $vgpr0 ; GFX900-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[V_MOV_B32_e32_]], %vgpr_offset, implicit $exec - ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]] + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]] ; GFX900-NEXT: [[V_ADD_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[V_MOV_B32_e32_]], %vgpr_offset, implicit $exec - ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_1]] + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_1]] ; GFX900-NEXT: SI_RETURN ; ; GFX942-LABEL: name: local_stack_alloc__v_add_u32_e32__vgpr_offsets_commute @@ -250,9 +250,9 @@ body: | ; GFX942-NEXT: {{ $}} ; GFX942-NEXT: %vgpr_offset:vgpr_32 = COPY $vgpr0 ; GFX942-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 %stack.0, %vgpr_offset, implicit $exec - ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]] + ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]] ; GFX942-NEXT: [[V_ADD_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 %stack.0, %vgpr_offset, implicit $exec - ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_1]] + ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_1]] ; GFX942-NEXT: SI_RETURN ; ; GFX10-LABEL: name: local_stack_alloc__v_add_u32_e32__vgpr_offsets_commute @@ -261,9 +261,9 @@ body: | ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec ; GFX10-NEXT: %vgpr_offset:vgpr_32 = COPY $vgpr0 ; GFX10-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[V_MOV_B32_e32_]], %vgpr_offset, implicit $exec - ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]] + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]] ; GFX10-NEXT: [[V_ADD_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[V_MOV_B32_e32_]], %vgpr_offset, implicit $exec - ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_1]] + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_1]] ; GFX10-NEXT: SI_RETURN ; ; GFX12-LABEL: name: local_stack_alloc__v_add_u32_e32__vgpr_offsets_commute @@ -271,15 +271,15 @@ body: | ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: %vgpr_offset:vgpr_32 = COPY $vgpr0 ; GFX12-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 %stack.0, %vgpr_offset, implicit $exec - ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]] + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]] ; GFX12-NEXT: [[V_ADD_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 %stack.0, %vgpr_offset, implicit $exec - ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_1]] + ; GFX12-NEXT: INLINEASM &"; 
use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_1]] ; GFX12-NEXT: SI_RETURN %vgpr_offset:vgpr_32 = COPY $vgpr0 %0:vgpr_32 = V_ADD_U32_e32 %stack.0, %vgpr_offset, implicit $exec - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, %0 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, %0 %1:vgpr_32 = V_ADD_U32_e32 %stack.0, %vgpr_offset, implicit $exec - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, %1 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, %1 SI_RETURN ... @@ -301,9 +301,9 @@ body: | ; GFX900-NEXT: {{ $}} ; GFX900-NEXT: %sgpr_offset:sreg_32 = COPY $sgpr8 ; GFX900-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 %sgpr_offset, %stack.0, implicit $exec - ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]] + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]] ; GFX900-NEXT: [[V_ADD_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 %sgpr_offset, %stack.0, implicit $exec - ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_1]] + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_1]] ; GFX900-NEXT: SI_RETURN ; ; GFX942-LABEL: name: local_stack_alloc__v_add_u32_e32__sgpr_offsets @@ -311,9 +311,9 @@ body: | ; GFX942-NEXT: {{ $}} ; GFX942-NEXT: %sgpr_offset:sreg_32 = COPY $sgpr8 ; GFX942-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 %sgpr_offset, %stack.0, implicit $exec - ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]] + ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]] ; GFX942-NEXT: [[V_ADD_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 %sgpr_offset, %stack.0, implicit $exec - ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_1]] + ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_1]] ; GFX942-NEXT: SI_RETURN ; ; GFX10-LABEL: name: local_stack_alloc__v_add_u32_e32__sgpr_offsets @@ -322,9 +322,9 @@ body: | ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec ; GFX10-NEXT: %sgpr_offset:sreg_32 = COPY $sgpr8 ; GFX10-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 %sgpr_offset, [[V_MOV_B32_e32_]], implicit $exec - ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]] + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]] ; GFX10-NEXT: [[V_ADD_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 %sgpr_offset, [[V_MOV_B32_e32_]], implicit $exec - ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_1]] + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_1]] ; GFX10-NEXT: SI_RETURN ; ; GFX12-LABEL: name: local_stack_alloc__v_add_u32_e32__sgpr_offsets @@ -332,15 +332,15 @@ body: | ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: %sgpr_offset:sreg_32 = COPY $sgpr8 ; GFX12-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = 
V_ADD_U32_e32 %sgpr_offset, %stack.0, implicit $exec - ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]] + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]] ; GFX12-NEXT: [[V_ADD_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 %sgpr_offset, %stack.0, implicit $exec - ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_1]] + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_1]] ; GFX12-NEXT: SI_RETURN %sgpr_offset:sreg_32 = COPY $sgpr8 %0:vgpr_32 = V_ADD_U32_e32 %sgpr_offset, %stack.0, implicit $exec - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, %0 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, %0 %1:vgpr_32 = V_ADD_U32_e32 %sgpr_offset, %stack.0, implicit $exec - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, %1 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, %1 SI_RETURN ... @@ -363,9 +363,9 @@ body: | ; GFX900-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec ; GFX900-NEXT: %sgpr_offset:sreg_32 = COPY $sgpr8 ; GFX900-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 %sgpr_offset, [[V_MOV_B32_e32_]], 0, implicit $exec - ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_]] + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_]] ; GFX900-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 %sgpr_offset, [[V_MOV_B32_e32_]], 0, implicit $exec - ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]] + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]] ; GFX900-NEXT: SI_RETURN ; ; GFX942-LABEL: name: local_stack_alloc__v_add_u32_e64__sgpr_offsets @@ -373,9 +373,9 @@ body: | ; GFX942-NEXT: {{ $}} ; GFX942-NEXT: %sgpr_offset:sreg_32 = COPY $sgpr8 ; GFX942-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 %sgpr_offset, %stack.0, 0, implicit $exec - ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_]] + ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_]] ; GFX942-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 %sgpr_offset, %stack.0, 0, implicit $exec - ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]] + ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]] ; GFX942-NEXT: SI_RETURN ; ; GFX10-LABEL: name: local_stack_alloc__v_add_u32_e64__sgpr_offsets @@ -384,9 +384,9 @@ body: | ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec ; GFX10-NEXT: %sgpr_offset:sreg_32 = COPY $sgpr8 ; GFX10-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 %sgpr_offset, [[V_MOV_B32_e32_]], 0, implicit $exec - ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_]] + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 
1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_]] ; GFX10-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 %sgpr_offset, [[V_MOV_B32_e32_]], 0, implicit $exec - ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]] + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]] ; GFX10-NEXT: SI_RETURN ; ; GFX12-LABEL: name: local_stack_alloc__v_add_u32_e64__sgpr_offsets @@ -394,15 +394,15 @@ body: | ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: %sgpr_offset:sreg_32 = COPY $sgpr8 ; GFX12-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 %sgpr_offset, %stack.0, 0, implicit $exec - ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_]] + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_]] ; GFX12-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 %sgpr_offset, %stack.0, 0, implicit $exec - ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]] + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]] ; GFX12-NEXT: SI_RETURN %sgpr_offset:sreg_32 = COPY $sgpr8 %0:vgpr_32 = V_ADD_U32_e64 %sgpr_offset, %stack.0, 0, implicit $exec - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, %0 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, %0 %1:vgpr_32 = V_ADD_U32_e64 %sgpr_offset, %stack.0, 0, implicit $exec - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, %1 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, %1 SI_RETURN ... 
@@ -425,9 +425,9 @@ body: | ; GFX900-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec ; GFX900-NEXT: %sgpr_offset:sreg_32 = COPY $sgpr8 ; GFX900-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_MOV_B32_e32_]], %sgpr_offset, 0, implicit $exec - ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_]] + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_]] ; GFX900-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_MOV_B32_e32_]], %sgpr_offset, 0, implicit $exec - ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]] + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]] ; GFX900-NEXT: SI_RETURN ; ; GFX942-LABEL: name: local_stack_alloc__v_add_u32_e64__sgpr_offsets_commute @@ -435,9 +435,9 @@ body: | ; GFX942-NEXT: {{ $}} ; GFX942-NEXT: %sgpr_offset:sreg_32 = COPY $sgpr8 ; GFX942-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 %stack.0, %sgpr_offset, 0, implicit $exec - ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_]] + ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_]] ; GFX942-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 %stack.0, %sgpr_offset, 0, implicit $exec - ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]] + ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]] ; GFX942-NEXT: SI_RETURN ; ; GFX10-LABEL: name: local_stack_alloc__v_add_u32_e64__sgpr_offsets_commute @@ -446,9 +446,9 @@ body: | ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec ; GFX10-NEXT: %sgpr_offset:sreg_32 = COPY $sgpr8 ; GFX10-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_MOV_B32_e32_]], %sgpr_offset, 0, implicit $exec - ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_]] + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_]] ; GFX10-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_MOV_B32_e32_]], %sgpr_offset, 0, implicit $exec - ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]] + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]] ; GFX10-NEXT: SI_RETURN ; ; GFX12-LABEL: name: local_stack_alloc__v_add_u32_e64__sgpr_offsets_commute @@ -456,15 +456,15 @@ body: | ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: %sgpr_offset:sreg_32 = COPY $sgpr8 ; GFX12-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 %stack.0, %sgpr_offset, 0, implicit $exec - ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_]] + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_]] ; GFX12-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 %stack.0, %sgpr_offset, 0, implicit $exec - ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]] + ; 
GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]] ; GFX12-NEXT: SI_RETURN %sgpr_offset:sreg_32 = COPY $sgpr8 %0:vgpr_32 = V_ADD_U32_e64 %stack.0, %sgpr_offset, 0, implicit $exec - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, %0 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, %0 %1:vgpr_32 = V_ADD_U32_e64 %stack.0, %sgpr_offset, 0, implicit $exec - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, %1 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, %1 SI_RETURN ... @@ -486,16 +486,16 @@ body: | ; GFX900-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec ; GFX900-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 killed [[S_MOV_B32_]], [[V_MOV_B32_e32_]], 0, implicit $exec ; GFX900-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_U32_e64_]] - ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[COPY]] + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[COPY]] ; GFX900-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 8, [[V_ADD_U32_e64_]], 1, implicit $exec - ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]] + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]] ; GFX900-NEXT: SI_RETURN ; ; GFX942-LABEL: name: local_stack_alloc__v_add_u32_e64__inline_imm_offsets_clamp_modifier ; GFX942: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 %stack.0, 8, 1, implicit $exec - ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_]] + ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_]] ; GFX942-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 16, %stack.0, 1, implicit $exec - ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]] + ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]] ; GFX942-NEXT: SI_RETURN ; ; GFX10-LABEL: name: local_stack_alloc__v_add_u32_e64__inline_imm_offsets_clamp_modifier @@ -503,21 +503,21 @@ body: | ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec ; GFX10-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 killed [[S_MOV_B32_]], [[V_MOV_B32_e32_]], 0, implicit $exec ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_U32_e64_]] - ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[COPY]] + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[COPY]] ; GFX10-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 8, [[V_ADD_U32_e64_]], 1, implicit $exec - ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]] + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]] ; GFX10-NEXT: SI_RETURN ; ; GFX12-LABEL: name: local_stack_alloc__v_add_u32_e64__inline_imm_offsets_clamp_modifier ; GFX12: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 %stack.0, 8, 1, implicit $exec 
- ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_]] + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_]] ; GFX12-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 16, %stack.0, 1, implicit $exec - ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]] + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]] ; GFX12-NEXT: SI_RETURN %0:vgpr_32 = V_ADD_U32_e64 %stack.0, 8, /*clamp*/1, implicit $exec - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, %0 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, %0 %1:vgpr_32 = V_ADD_U32_e64 16, %stack.0, /*clamp*/1, implicit $exec - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, %1 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, %1 SI_RETURN ... diff --git a/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats.mir b/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats.mir index c117473581746..1e8a2d3ad9163 100644 --- a/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats.mir +++ b/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats.mir @@ -6429,7 +6429,7 @@ body: | ; GFX908-NEXT: [[V_CVT_I32_F64_e32_19:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 19, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_20:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 20, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_21:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode, implicit-def $m0 - ; GFX908-NEXT: INLINEASM &"v_or_b32 $0, 0, $1", 0 /* attdialect */, 1835018 /* regdef:VGPR_32 */, def %22, 1835017 /* reguse:VGPR_32 */, [[V_CVT_I32_F64_e32_4]] + ; GFX908-NEXT: INLINEASM &"v_or_b32 $0, 0, $1", 0 /* attdialect */, 1245194 /* regdef:VGPR_32 */, def %22, 1245193 /* reguse:VGPR_32 */, [[V_CVT_I32_F64_e32_4]] ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) @@ -6478,7 +6478,7 @@ body: | %19:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 19, implicit $exec, implicit $mode, implicit-def $m0 %20:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 20, implicit $exec, implicit $mode, implicit-def $m0 %21:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode, implicit-def $m0 - INLINEASM &"v_or_b32 $0, 0, $1", 0 /* attdialect */, 1835018 /* regdef:VGPR_32 */, def %22:vgpr_32, 1835017 /* reguse:VGPR_32 */, %4 + INLINEASM &"v_or_b32 $0, 0, $1", 0 /* attdialect */, 1245194 /* regdef:VGPR_32 */, def %22:vgpr_32, 1245193 /* reguse:VGPR_32 */, %4 %23:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode bb.1: diff --git a/llvm/test/CodeGen/AMDGPU/mai-hazards.mir b/llvm/test/CodeGen/AMDGPU/mai-hazards.mir index 8b2c5887c97ee..5095ad021fde3 100644 --- a/llvm/test/CodeGen/AMDGPU/mai-hazards.mir +++ b/llvm/test/CodeGen/AMDGPU/mai-hazards.mir @@ -33,7 +33,7 @@ name: asm_write_vgpr_accvgpr_write_read body: | bb.0: - INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 1835018 /* regdef:VGPR_32 */, def $vgpr0 + INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 1245194 /* regdef:VGPR_32 */, def $vgpr0 $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec ... 
@@ -47,7 +47,7 @@ name: asm_write_vgpr_accvgpr_write_read_partialnop body: | bb.0: - INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 1835018 /* regdef:VGPR_32 */, def $vgpr0 + INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 1245194 /* regdef:VGPR_32 */, def $vgpr0 S_NOP 0 $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec ... @@ -60,7 +60,7 @@ name: asm_write_vgpr_accvgpr_write_read_otherreg body: | bb.0: liveins: $vgpr0 - INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 1835018 /* regdef:VGPR_32 */, def $vgpr1 + INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 1245194 /* regdef:VGPR_32 */, def $vgpr1 $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec ... diff --git a/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll b/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll index 2aae26b9470a8..6381db7b69cd4 100644 --- a/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll +++ b/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll @@ -11,10 +11,10 @@ define amdgpu_kernel void @partial_copy(<4 x i32> %arg) #0 { ; REGALLOC-GFX908: bb.0 (%ir-block.0): ; REGALLOC-GFX908-NEXT: liveins: $sgpr4_sgpr5 ; REGALLOC-GFX908-NEXT: {{ $}} - ; REGALLOC-GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:AGPR_32 */, undef %6:agpr_32 - ; REGALLOC-GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7602186 /* regdef:VReg_128 */, def %25 + ; REGALLOC-GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1638409 /* reguse:AGPR_32 */, undef %6:agpr_32 + ; REGALLOC-GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7012362 /* regdef:VReg_128 */, def %25 ; REGALLOC-GFX908-NEXT: [[COPY:%[0-9]+]]:av_128 = COPY %25 - ; REGALLOC-GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3407882 /* regdef:VReg_64 */, def %27 + ; REGALLOC-GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 2818058 /* regdef:VReg_64 */, def %27 ; REGALLOC-GFX908-NEXT: SI_SPILL_AV64_SAVE %27, %stack.0, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.0, align 4, addrspace 5) ; REGALLOC-GFX908-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY [[COPY]] ; REGALLOC-GFX908-NEXT: GLOBAL_STORE_DWORDX4 undef %15:vreg_64, [[COPY1]], 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) poison`, addrspace 1) @@ -36,10 +36,10 @@ define amdgpu_kernel void @partial_copy(<4 x i32> %arg) #0 { ; PEI-GFX908-NEXT: $sgpr12_sgpr13_sgpr14_sgpr15 = COPY killed $sgpr0_sgpr1_sgpr2_sgpr3 ; PEI-GFX908-NEXT: $sgpr12 = S_ADD_U32 $sgpr12, $sgpr9, implicit-def $scc, implicit-def $sgpr12_sgpr13_sgpr14_sgpr15 ; PEI-GFX908-NEXT: $sgpr13 = S_ADDC_U32 $sgpr13, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr12_sgpr13_sgpr14_sgpr15 - ; PEI-GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:AGPR_32 */, undef renamable $agpr0 - ; PEI-GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7602186 /* regdef:VReg_128 */, def renamable $vgpr0_vgpr1_vgpr2_vgpr3 + ; PEI-GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1638409 /* reguse:AGPR_32 */, undef renamable $agpr0 + ; PEI-GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7012362 /* regdef:VReg_128 */, def renamable $vgpr0_vgpr1_vgpr2_vgpr3 ; PEI-GFX908-NEXT: renamable $agpr0_agpr1_agpr2_agpr3 = COPY killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, implicit $exec - ; PEI-GFX908-NEXT: INLINEASM &"; def $0", 
1 /* sideeffect attdialect */, 3407882 /* regdef:VReg_64 */, def renamable $vgpr0_vgpr1 + ; PEI-GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 2818058 /* regdef:VReg_64 */, def renamable $vgpr0_vgpr1 ; PEI-GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr12_sgpr13_sgpr14_sgpr15, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr0_vgpr1 :: (store (s32) into %stack.0, addrspace 5) ; PEI-GFX908-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit killed $vgpr0_vgpr1 ; PEI-GFX908-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = COPY killed renamable $agpr0_agpr1_agpr2_agpr3, implicit $exec @@ -60,10 +60,10 @@ define amdgpu_kernel void @partial_copy(<4 x i32> %arg) #0 { ; REGALLOC-GFX90A: bb.0 (%ir-block.0): ; REGALLOC-GFX90A-NEXT: liveins: $sgpr4_sgpr5 ; REGALLOC-GFX90A-NEXT: {{ $}} - ; REGALLOC-GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:AGPR_32 */, undef %6:agpr_32 - ; REGALLOC-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7929866 /* regdef:VReg_128_Align2 */, def %23 + ; REGALLOC-GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1638409 /* reguse:AGPR_32 */, undef %6:agpr_32 + ; REGALLOC-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7340042 /* regdef:VReg_128_Align2 */, def %23 ; REGALLOC-GFX90A-NEXT: [[COPY:%[0-9]+]]:av_128_align2 = COPY %23 - ; REGALLOC-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3670026 /* regdef:VReg_64_Align2 */, def %21 + ; REGALLOC-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3080202 /* regdef:VReg_64_Align2 */, def %21 ; REGALLOC-GFX90A-NEXT: [[COPY1:%[0-9]+]]:av_64_align2 = COPY %21 ; REGALLOC-GFX90A-NEXT: GLOBAL_STORE_DWORDX4 undef %15:vreg_64_align2, [[COPY]], 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) poison`, addrspace 1) ; REGALLOC-GFX90A-NEXT: renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (s128) from %ir.arg.kernarg.offset1, addrspace 4) @@ -79,10 +79,10 @@ define amdgpu_kernel void @partial_copy(<4 x i32> %arg) #0 { ; PEI-GFX90A: bb.0 (%ir-block.0): ; PEI-GFX90A-NEXT: liveins: $sgpr4_sgpr5 ; PEI-GFX90A-NEXT: {{ $}} - ; PEI-GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:AGPR_32 */, undef renamable $agpr0 - ; PEI-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7929866 /* regdef:VReg_128_Align2 */, def renamable $vgpr0_vgpr1_vgpr2_vgpr3 + ; PEI-GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1638409 /* reguse:AGPR_32 */, undef renamable $agpr0 + ; PEI-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7340042 /* regdef:VReg_128_Align2 */, def renamable $vgpr0_vgpr1_vgpr2_vgpr3 ; PEI-GFX90A-NEXT: renamable $agpr0_agpr1_agpr2_agpr3 = COPY killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, implicit $exec - ; PEI-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3670026 /* regdef:VReg_64_Align2 */, def renamable $vgpr2_vgpr3 + ; PEI-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3080202 /* regdef:VReg_64_Align2 */, def renamable $vgpr2_vgpr3 ; PEI-GFX90A-NEXT: GLOBAL_STORE_DWORDX4 undef renamable $vgpr0_vgpr1, killed renamable $agpr0_agpr1_agpr2_agpr3, 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) poison`, addrspace 1) ; PEI-GFX90A-NEXT: renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM killed 
renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (s128) from %ir.arg.kernarg.offset1, addrspace 4) ; PEI-GFX90A-NEXT: renamable $agpr0_agpr1_agpr2_agpr3 = COPY killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/regalloc-failure-overlapping-insert-assert.mir b/llvm/test/CodeGen/AMDGPU/regalloc-failure-overlapping-insert-assert.mir index a97ef058ce5fa..db8908bcbac67 100644 --- a/llvm/test/CodeGen/AMDGPU/regalloc-failure-overlapping-insert-assert.mir +++ b/llvm/test/CodeGen/AMDGPU/regalloc-failure-overlapping-insert-assert.mir @@ -43,17 +43,17 @@ machineFunctionInfo: body: | bb.0: - INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 2228234 /* regdef:AGPR_32 */, implicit-def $agpr0 + INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 1638410 /* regdef:AGPR_32 */, implicit-def $agpr0 %14:vgpr_32 = COPY killed $agpr0 - INLINEASM &"; def $0 $1 $2 $3 $4", 1 /* sideeffect attdialect */, 39714826 /* regdef:VReg_512 */, def %7, 19136522 /* regdef:VReg_256 */, def %8, 7602186 /* regdef:VReg_128 */, def %9, 5636106 /* regdef:VReg_96 */, def %10, 5636106 /* regdef:VReg_96 */, def %11 + INLINEASM &"; def $0 $1 $2 $3 $4", 1 /* sideeffect attdialect */, 39125002 /* regdef:VReg_512 */, def %7, 18546698 /* regdef:VReg_256 */, def %8, 7012362 /* regdef:VReg_128 */, def %9, 5046282 /* regdef:VReg_96 */, def %10, 5046282 /* regdef:VReg_96 */, def %11 INLINEASM &"; clobber", 1 /* sideeffect attdialect */, 12 /* clobber */, implicit-def dead early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31, 12 /* clobber */, implicit-def dead early-clobber $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 39714825 /* reguse:VReg_512 */, %7 - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 19136521 /* reguse:VReg_256 */, %8 - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 7602185 /* reguse:VReg_128 */, %9 - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 5636105 /* reguse:VReg_96 */, %10 - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 5636105 /* reguse:VReg_96 */, %11 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 39125001 /* reguse:VReg_512 */, %7 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 18546697 /* reguse:VReg_256 */, %8 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 7012361 /* reguse:VReg_128 */, %9 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 5046281 /* reguse:VReg_96 */, %10 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 5046281 /* reguse:VReg_96 */, %11 $agpr1 = COPY %14 - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:AGPR_32 */, killed $agpr1 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1638409 /* reguse:AGPR_32 */, killed $agpr1 SI_RETURN ... 
diff --git a/llvm/test/CodeGen/AMDGPU/rename-independent-subregs.mir b/llvm/test/CodeGen/AMDGPU/rename-independent-subregs.mir index 35be513c784bb..8ac85fa9c41a2 100644 --- a/llvm/test/CodeGen/AMDGPU/rename-independent-subregs.mir +++ b/llvm/test/CodeGen/AMDGPU/rename-independent-subregs.mir @@ -73,7 +73,7 @@ body: | # (1) %0.sub0 + %0.sub0 and (2) %0.sub1 + %0.sub1 # Check that renaming (2) does not inadvertently rename (1). # CHECK-LABEL: name: test2 -# CHECK: INLINEASM &"", 1 /* sideeffect attdialect */, 1835018 /* regdef:VGPR_32 */, def undef %0.sub0, 1835018 /* regdef:VGPR_32 */, def dead %1.sub1, 2147483657 /* reguse tiedto:$0 */, undef %0.sub0(tied-def 3), 2147549193 /* reguse tiedto:$1 */, %1.sub1(tied-def 5) +# CHECK: INLINEASM &"", 1 /* sideeffect attdialect */, 1245194 /* regdef:VGPR_32 */, def undef %0.sub0, 1245194 /* regdef:VGPR_32 */, def dead %1.sub1, 2147483657 /* reguse tiedto:$0 */, undef %0.sub0(tied-def 3), 2147549193 /* reguse tiedto:$1 */, %1.sub1(tied-def 5) name: test2 body: | bb.0: @@ -81,7 +81,7 @@ body: | bb.1: undef %0.sub1:vreg_64 = V_ALIGNBIT_B32_e64 %0.sub0:vreg_64, %0.sub0:vreg_64, 16, implicit $exec - INLINEASM &"", 1 /* sideeffect attdialect */, 1835018 /* regdef:VGPR_32 */, def undef %0.sub0:vreg_64, 1835018 /* regdef:VGPR_32 */, def %0.sub1:vreg_64, 2147483657 /* reguse tiedto:$0 */, undef %0.sub0:vreg_64(tied-def 3), 2147549193 /* reguse tiedto:$1 */, %0.sub1:vreg_64(tied-def 5) + INLINEASM &"", 1 /* sideeffect attdialect */, 1245194 /* regdef:VGPR_32 */, def undef %0.sub0:vreg_64, 1245194 /* regdef:VGPR_32 */, def %0.sub1:vreg_64, 2147483657 /* reguse tiedto:$0 */, undef %0.sub0:vreg_64(tied-def 3), 2147549193 /* reguse tiedto:$1 */, %0.sub1:vreg_64(tied-def 5) S_BRANCH %bb.1 ... diff --git a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-copy-from.mir b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-copy-from.mir index 39f64185b9d57..cdd68630bf4ff 100644 --- a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-copy-from.mir +++ b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-copy-from.mir @@ -43,7 +43,7 @@ body: | ; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX4_:%[0-9]+]]:areg_128_align2 = GLOBAL_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec :: (load (s128), addrspace 1) ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vreg_128_align2 = COPY [[GLOBAL_LOAD_DWORDX4_]] ; CHECK-NEXT: [[V_MFMA_F64_4X4X4F64_vgprcd_e64_:%[0-9]+]]:vreg_64_align2 = V_MFMA_F64_4X4X4F64_vgprcd_e64 [[COPY1]], [[COPY2]], [[COPY3]].sub0_sub1, 0, 0, 0, implicit $mode, implicit $exec - ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 3670025 /* reguse:VReg_64_Align2 */, [[V_MFMA_F64_4X4X4F64_vgprcd_e64_]] + ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 3080201 /* reguse:VReg_64_Align2 */, [[V_MFMA_F64_4X4X4F64_vgprcd_e64_]] ; CHECK-NEXT: SI_RETURN %0:vreg_64_align2 = COPY $vgpr4_vgpr5 %1:av_64_align2 = COPY $vgpr0_vgpr1 @@ -51,7 +51,7 @@ body: | %3:areg_128_align2 = GLOBAL_LOAD_DWORDX4 %0, 0, 0, implicit $exec :: (load (s128), addrspace 1) %4:vreg_128_align2 = COPY %3 %5:vreg_64_align2 = V_MFMA_F64_4X4X4F64_vgprcd_e64 %1, %2, %4.sub0_sub1, 0, 0, 0, implicit $mode, implicit $exec - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 3670025 /* reguse:VReg_64_Align2 */, %5 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 3080201 /* reguse:VReg_64_Align2 */, %5 SI_RETURN ... 
diff --git a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-subreg-insert-extract.mir b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-subreg-insert-extract.mir index c764bc17f0631..d7b713aa53b86 100644 --- a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-subreg-insert-extract.mir +++ b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-subreg-insert-extract.mir @@ -19,7 +19,7 @@ body: | ; CHECK-NEXT: [[V_MFMA_F64_4X4X4F64_e64_:%[0-9]+]]:areg_64_align2 = V_MFMA_F64_4X4X4F64_e64 [[COPY1]], [[COPY2]], [[GLOBAL_LOAD_DWORDX2_]], 0, 0, 0, implicit $mode, implicit $exec ; CHECK-NEXT: undef [[COPY3:%[0-9]+]].sub0_sub1:areg_128_align2 = COPY [[V_MFMA_F64_4X4X4F64_e64_]] ; CHECK-NEXT: [[COPY3:%[0-9]+]].sub2_sub3:areg_128_align2 = IMPLICIT_DEF - ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 9240585 /* reguse:AReg_128_Align2 */, [[COPY3]] + ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8650761 /* reguse:AReg_128_Align2 */, [[COPY3]] ; CHECK-NEXT: GLOBAL_STORE_DWORDX4 [[COPY]], [[COPY3]], 0, 0, implicit $exec :: (store (s128), addrspace 1) ; CHECK-NEXT: GLOBAL_STORE_DWORDX2 [[COPY]], [[COPY3]].sub2_sub3, 0, 0, implicit $exec :: (store (s128), addrspace 1) ; CHECK-NEXT: SI_RETURN @@ -30,7 +30,7 @@ body: | %4:vreg_64_align2 = V_MFMA_F64_4X4X4F64_vgprcd_e64 %1, %2, %3, 0, 0, 0, implicit $mode, implicit $exec undef %5.sub0_sub1:areg_128_align2 = COPY %4 %5.sub2_sub3 = IMPLICIT_DEF - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 9240585 /* reguse:AReg_128_Align2 */, %5 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8650761 /* reguse:AReg_128_Align2 */, %5 GLOBAL_STORE_DWORDX4 %0, %5, 0, 0, implicit $exec :: (store (s128), addrspace 1) GLOBAL_STORE_DWORDX2 %0, %5.sub2_sub3, 0, 0, implicit $exec :: (store (s128), addrspace 1) SI_RETURN @@ -172,7 +172,7 @@ body: | ; CHECK-NEXT: undef [[V_MFMA_F64_4X4X4F64_e64_:%[0-9]+]].sub2_sub3:areg_128_align2 = V_MFMA_F64_4X4X4F64_e64 [[COPY1]], [[COPY2]], [[GLOBAL_LOAD_DWORDX2_]], 0, 0, 0, implicit $mode, implicit $exec ; CHECK-NEXT: undef [[COPY3:%[0-9]+]].sub0_sub1:areg_128_align2 = COPY [[V_MFMA_F64_4X4X4F64_e64_]].sub2_sub3 ; CHECK-NEXT: [[COPY3:%[0-9]+]].sub2_sub3:areg_128_align2 = IMPLICIT_DEF - ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 9240585 /* reguse:AReg_128_Align2 */, [[COPY3]] + ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8650761 /* reguse:AReg_128_Align2 */, [[COPY3]] ; CHECK-NEXT: GLOBAL_STORE_DWORDX4 [[COPY]], [[COPY3]], 0, 0, implicit $exec :: (store (s128), addrspace 1) ; CHECK-NEXT: GLOBAL_STORE_DWORDX2 [[COPY]], [[COPY3]].sub2_sub3, 0, 0, implicit $exec :: (store (s128), addrspace 1) ; CHECK-NEXT: SI_RETURN @@ -183,7 +183,7 @@ body: | undef %4.sub2_sub3:vreg_128_align2 = V_MFMA_F64_4X4X4F64_vgprcd_e64 %1, %2, %3, 0, 0, 0, implicit $mode, implicit $exec undef %5.sub0_sub1:areg_128_align2 = COPY %4.sub2_sub3 %5.sub2_sub3 = IMPLICIT_DEF - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 9240585 /* reguse:AReg_128_Align2 */, %5 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8650761 /* reguse:AReg_128_Align2 */, %5 GLOBAL_STORE_DWORDX4 %0, %5, 0, 0, implicit $exec :: (store (s128), addrspace 1) GLOBAL_STORE_DWORDX2 %0, %5.sub2_sub3, 0, 0, implicit $exec :: (store (s128), addrspace 1) SI_RETURN @@ -208,7 +208,7 @@ body: | ; CHECK-NEXT: undef [[V_MFMA_F64_4X4X4F64_vgprcd_e64_:%[0-9]+]].sub2_sub3:vreg_128_align2 = V_MFMA_F64_4X4X4F64_vgprcd_e64 [[COPY1]], [[COPY2]], [[GLOBAL_LOAD_DWORDX2_]], 0, 0, 0, implicit $mode, implicit 
$exec ; CHECK-NEXT: undef [[COPY3:%[0-9]+]].sub1:areg_128_align2 = COPY [[V_MFMA_F64_4X4X4F64_vgprcd_e64_]].sub2 ; CHECK-NEXT: [[COPY3:%[0-9]+]].sub2_sub3:areg_128_align2 = IMPLICIT_DEF - ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 9240585 /* reguse:AReg_128_Align2 */, [[COPY3]] + ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8650761 /* reguse:AReg_128_Align2 */, [[COPY3]] ; CHECK-NEXT: GLOBAL_STORE_DWORDX4 [[COPY]], [[COPY3]], 0, 0, implicit $exec :: (store (s128), addrspace 1) ; CHECK-NEXT: GLOBAL_STORE_DWORDX2 [[COPY]], [[COPY3]].sub2_sub3, 0, 0, implicit $exec :: (store (s128), addrspace 1) ; CHECK-NEXT: SI_RETURN @@ -219,7 +219,7 @@ body: | undef %4.sub2_sub3:vreg_128_align2 = V_MFMA_F64_4X4X4F64_vgprcd_e64 %1, %2, %3, 0, 0, 0, implicit $mode, implicit $exec undef %5.sub1:areg_128_align2 = COPY %4.sub2 %5.sub2_sub3 = IMPLICIT_DEF - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 9240585 /* reguse:AReg_128_Align2 */, %5 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8650761 /* reguse:AReg_128_Align2 */, %5 GLOBAL_STORE_DWORDX4 %0, %5, 0, 0, implicit $exec :: (store (s128), addrspace 1) GLOBAL_STORE_DWORDX2 %0, %5.sub2_sub3, 0, 0, implicit $exec :: (store (s128), addrspace 1) SI_RETURN diff --git a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-subreg-src2-chain.mir b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-subreg-src2-chain.mir index c0f5a7737afb0..3f61c3dbfaf37 100644 --- a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-subreg-src2-chain.mir +++ b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-subreg-src2-chain.mir @@ -17,7 +17,7 @@ body: | ; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX4_:%[0-9]+]]:areg_128_align2 = GLOBAL_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec :: (load (s128), addrspace 1) ; CHECK-NEXT: [[V_MFMA_F64_4X4X4F64_e64_:%[0-9]+]]:areg_64_align2 = V_MFMA_F64_4X4X4F64_e64 [[COPY1]], [[COPY2]], [[GLOBAL_LOAD_DWORDX4_]].sub0_sub1, 0, 0, 0, implicit $mode, implicit $exec ; CHECK-NEXT: undef [[COPY3:%[0-9]+]].sub0_sub1:areg_128_align2 = COPY [[V_MFMA_F64_4X4X4F64_e64_]] - ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 9240585 /* reguse:AReg_128_Align2 */, [[COPY3]] + ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8650761 /* reguse:AReg_128_Align2 */, [[COPY3]] ; CHECK-NEXT: GLOBAL_STORE_DWORDX4 [[COPY]], [[COPY3]], 0, 0, implicit $exec :: (store (s128), addrspace 1) ; CHECK-NEXT: SI_RETURN %0:vreg_64_align2 = COPY $vgpr4_vgpr5 @@ -26,7 +26,7 @@ body: | %3:vreg_128_align2 = GLOBAL_LOAD_DWORDX4 %0, 0, 0, implicit $exec :: (load (s128), addrspace 1) %4:vreg_64_align2 = V_MFMA_F64_4X4X4F64_vgprcd_e64 %1, %2, %3.sub0_sub1, 0, 0, 0, implicit $mode, implicit $exec undef %5.sub0_sub1:areg_128_align2 = COPY %4 - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 9240585 /* reguse:AReg_128_Align2 */, %5 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8650761 /* reguse:AReg_128_Align2 */, %5 GLOBAL_STORE_DWORDX4 %0, %5, 0, 0, implicit $exec :: (store (s128), addrspace 1) SI_RETURN ... 
@@ -47,7 +47,7 @@ body: | ; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX4_:%[0-9]+]]:areg_128_align2 = GLOBAL_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec :: (load (s128), addrspace 1) ; CHECK-NEXT: [[V_MFMA_F64_4X4X4F64_e64_:%[0-9]+]]:areg_64_align2 = V_MFMA_F64_4X4X4F64_e64 [[COPY1]], [[COPY2]], [[GLOBAL_LOAD_DWORDX4_]].sub2_sub3, 0, 0, 0, implicit $mode, implicit $exec ; CHECK-NEXT: undef [[COPY3:%[0-9]+]].sub0_sub1:areg_128_align2 = COPY [[V_MFMA_F64_4X4X4F64_e64_]] - ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 9240585 /* reguse:AReg_128_Align2 */, [[COPY3]] + ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8650761 /* reguse:AReg_128_Align2 */, [[COPY3]] ; CHECK-NEXT: GLOBAL_STORE_DWORDX4 [[COPY]], [[COPY3]], 0, 0, implicit $exec :: (store (s128), addrspace 1) ; CHECK-NEXT: SI_RETURN %0:vreg_64_align2 = COPY $vgpr4_vgpr5 @@ -56,7 +56,7 @@ body: | %3:vreg_128_align2 = GLOBAL_LOAD_DWORDX4 %0, 0, 0, implicit $exec :: (load (s128), addrspace 1) %4:vreg_64_align2 = V_MFMA_F64_4X4X4F64_vgprcd_e64 %1, %2, %3.sub2_sub3, 0, 0, 0, implicit $mode, implicit $exec undef %5.sub0_sub1:areg_128_align2 = COPY %4 - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 9240585 /* reguse:AReg_128_Align2 */, %5 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8650761 /* reguse:AReg_128_Align2 */, %5 GLOBAL_STORE_DWORDX4 %0, %5, 0, 0, implicit $exec :: (store (s128), addrspace 1) SI_RETURN ... @@ -79,7 +79,7 @@ body: | ; CHECK-NEXT: dead %other_use:vreg_64_align2 = COPY [[V_MFMA_F64_4X4X4F64_e64_]].sub0_sub1 ; CHECK-NEXT: [[V_MFMA_F64_4X4X4F64_e64_1:%[0-9]+]]:areg_64_align2 = V_MFMA_F64_4X4X4F64_e64 [[COPY1]], [[COPY2]], [[V_MFMA_F64_4X4X4F64_e64_]].sub0_sub1, 0, 0, 0, implicit $mode, implicit $exec ; CHECK-NEXT: [[COPY3:%[0-9]+]]:areg_64_align2 = COPY [[V_MFMA_F64_4X4X4F64_e64_1]] - ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 4587529 /* reguse:AReg_64_Align2 */, [[COPY3]] + ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 3997705 /* reguse:AReg_64_Align2 */, [[COPY3]] ; CHECK-NEXT: GLOBAL_STORE_DWORDX2 [[COPY]], [[COPY3]], 0, 0, implicit $exec :: (store (s64), addrspace 1) ; CHECK-NEXT: SI_RETURN %0:vreg_64_align2 = COPY $vgpr4_vgpr5 @@ -90,7 +90,7 @@ body: | %other_use:vreg_64_align2 = COPY %4.sub0_sub1 %5:vreg_64_align2 = V_MFMA_F64_4X4X4F64_vgprcd_e64 %1, %2, %4.sub0_sub1, 0, 0, 0, implicit $mode, implicit $exec %6:areg_64_align2 = COPY %5 - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 4587529 /* reguse:AReg_64_Align2 */, %6:areg_64_align2 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 3997705 /* reguse:AReg_64_Align2 */, %6:areg_64_align2 GLOBAL_STORE_DWORDX2 %0, %6, 0, 0, implicit $exec :: (store (s64), addrspace 1) SI_RETURN ... 
@@ -114,7 +114,7 @@ body: | ; CHECK-NEXT: undef [[V_MFMA_F64_4X4X4F64_e64_1:%[0-9]+]].sub0_sub1:areg_128_align2 = V_MFMA_F64_4X4X4F64_e64 [[COPY1]], [[COPY2]], [[V_MFMA_F64_4X4X4F64_e64_]], 0, 0, 0, implicit $mode, implicit $exec ; CHECK-NEXT: [[V_MFMA_F64_4X4X4F64_e64_2:%[0-9]+]]:areg_64_align2 = V_MFMA_F64_4X4X4F64_e64 [[COPY1]], [[COPY2]], [[V_MFMA_F64_4X4X4F64_e64_1]].sub0_sub1, 0, 0, 0, implicit $mode, implicit $exec ; CHECK-NEXT: [[COPY3:%[0-9]+]]:areg_64_align2 = COPY [[V_MFMA_F64_4X4X4F64_e64_2]] - ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 4587529 /* reguse:AReg_64_Align2 */, [[COPY3]] + ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 3997705 /* reguse:AReg_64_Align2 */, [[COPY3]] ; CHECK-NEXT: GLOBAL_STORE_DWORDX2 [[COPY]], [[COPY3]], 0, 0, implicit $exec :: (store (s64), addrspace 1) ; CHECK-NEXT: SI_RETURN %0:vreg_64_align2 = COPY $vgpr4_vgpr5 @@ -126,7 +126,7 @@ body: | undef %5.sub0_sub1:vreg_128_align2 = V_MFMA_F64_4X4X4F64_vgprcd_e64 %1, %2, %4, 0, 0, 0, implicit $mode, implicit $exec %6:vreg_64_align2 = V_MFMA_F64_4X4X4F64_vgprcd_e64 %1, %2, %5.sub0_sub1, 0, 0, 0, implicit $mode, implicit $exec %7:areg_64_align2 = COPY %6 - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 4587529 /* reguse:AReg_64_Align2 */, %7 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 3997705 /* reguse:AReg_64_Align2 */, %7 GLOBAL_STORE_DWORDX2 %0, %7, 0, 0, implicit $exec :: (store (s64), addrspace 1) SI_RETURN @@ -151,7 +151,7 @@ body: | ; CHECK-NEXT: dead %other_use:vreg_64_align2 = COPY [[V_MFMA_F64_4X4X4F64_e64_1]].sub0_sub1 ; CHECK-NEXT: [[V_MFMA_F64_4X4X4F64_e64_2:%[0-9]+]]:areg_64_align2 = V_MFMA_F64_4X4X4F64_e64 [[COPY1]], [[COPY2]], [[V_MFMA_F64_4X4X4F64_e64_1]].sub0_sub1, 0, 0, 0, implicit $mode, implicit $exec ; CHECK-NEXT: undef [[COPY3:%[0-9]+]].sub0_sub1:areg_128_align2 = COPY [[V_MFMA_F64_4X4X4F64_e64_2]] - ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 9240585 /* reguse:AReg_128_Align2 */, [[COPY3]] + ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8650761 /* reguse:AReg_128_Align2 */, [[COPY3]] ; CHECK-NEXT: GLOBAL_STORE_DWORDX4 [[COPY]], [[COPY3]], 0, 0, implicit $exec :: (store (s128), addrspace 1) ; CHECK-NEXT: SI_RETURN %0:vreg_64_align2 = COPY $vgpr4_vgpr5 @@ -163,7 +163,7 @@ body: | %other_use:vreg_64_align2 = COPY %5.sub0_sub1 %6:vreg_64_align2 = V_MFMA_F64_4X4X4F64_vgprcd_e64 %1, %2, %5.sub0_sub1, 0, 0, 0, implicit $mode, implicit $exec undef %8.sub0_sub1:areg_128_align2 = COPY %6 - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 9240585 /* reguse:AReg_128_Align2 */, %8:areg_128_align2 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8650761 /* reguse:AReg_128_Align2 */, %8:areg_128_align2 GLOBAL_STORE_DWORDX4 %0, %8, 0, 0, implicit $exec :: (store (s128), addrspace 1) SI_RETURN @@ -189,7 +189,7 @@ body: | ; CHECK-NEXT: dead %other_use1:vreg_64_align2 = COPY [[V_MFMA_F64_4X4X4F64_e64_1]].sub0_sub1 ; CHECK-NEXT: [[V_MFMA_F64_4X4X4F64_e64_2:%[0-9]+]]:areg_64_align2 = V_MFMA_F64_4X4X4F64_e64 [[COPY1]], [[COPY2]], [[V_MFMA_F64_4X4X4F64_e64_1]].sub0_sub1, 0, 0, 0, implicit $mode, implicit $exec ; CHECK-NEXT: [[COPY3:%[0-9]+]]:agpr_32 = COPY [[V_MFMA_F64_4X4X4F64_e64_2]] - ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:AGPR_32 */, [[COPY3]] + ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1638409 /* reguse:AGPR_32 */, [[COPY3]] ; CHECK-NEXT: GLOBAL_STORE_DWORD [[COPY]], [[COPY3]], 0, 0, implicit $exec :: (store 
(s32), addrspace 1) ; CHECK-NEXT: SI_RETURN %0:vreg_64_align2 = COPY $vgpr4_vgpr5 @@ -202,7 +202,7 @@ body: | %other_use1:vreg_64_align2 = COPY %5.sub0_sub1 %6:vreg_64_align2 = V_MFMA_F64_4X4X4F64_vgprcd_e64 %1, %2, %5.sub0_sub1, 0, 0, 0, implicit $mode, implicit $exec %8:agpr_32 = COPY %6 - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:AGPR_32 */, %8:agpr_32 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1638409 /* reguse:AGPR_32 */, %8:agpr_32 GLOBAL_STORE_DWORD %0, %8, 0, 0, implicit $exec :: (store (s32), addrspace 1) SI_RETURN @@ -231,7 +231,7 @@ body: | ; CHECK-NEXT: dead %other_use1:vreg_64_align2 = COPY [[V_MFMA_F64_4X4X4F64_e64_]].sub2_sub3 ; CHECK-NEXT: dead %other_use2:vreg_64 = COPY [[V_MFMA_F64_4X4X4F64_e64_]].sub1_sub2 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:areg_128_align2 = COPY [[V_MFMA_F64_4X4X4F64_e64_]] - ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 9240585 /* reguse:AReg_128_Align2 */, [[COPY3]] + ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8650761 /* reguse:AReg_128_Align2 */, [[COPY3]] ; CHECK-NEXT: GLOBAL_STORE_DWORDX4 [[COPY]], [[COPY3]], 0, 0, implicit $exec :: (store (s128), addrspace 1) ; CHECK-NEXT: SI_RETURN %0:vreg_64_align2 = COPY $vgpr4_vgpr5 @@ -245,7 +245,7 @@ body: | %other_use1:vreg_64_align2 = COPY %4.sub2_sub3 %other_use2:vreg_64 = COPY %4.sub1_sub2 %6:areg_128_align2 = COPY %4 - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 9240585 /* reguse:AReg_128_Align2 */, %6:areg_128_align2 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8650761 /* reguse:AReg_128_Align2 */, %6:areg_128_align2 GLOBAL_STORE_DWORDX4 %0, %6, 0, 0, implicit $exec :: (store (s128), addrspace 1) SI_RETURN ... @@ -273,7 +273,7 @@ body: | ; CHECK-NEXT: %other_use1:vreg_64_align2 = COPY [[V_MFMA_F64_4X4X4F64_e64_]].sub2_sub3 ; CHECK-NEXT: dead %other_use2:vreg_64 = COPY [[V_MFMA_F64_4X4X4F64_e64_]].sub1_sub2 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:areg_64 = COPY [[V_MFMA_F64_4X4X4F64_e64_]].sub1_sub2 - ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 3997705 /* reguse:AReg_64 */, [[COPY3]] + ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 3407881 /* reguse:AReg_64 */, [[COPY3]] ; CHECK-NEXT: GLOBAL_STORE_DWORDX2 [[COPY]], %other_use1, 0, 0, implicit $exec :: (store (s64), addrspace 1) ; CHECK-NEXT: SI_RETURN %0:vreg_64_align2 = COPY $vgpr4_vgpr5 @@ -287,7 +287,7 @@ body: | %other_use1:vreg_64_align2 = COPY %4.sub2_sub3 %other_use2:vreg_64 = COPY %4.sub1_sub2 %6:areg_64 = COPY %4.sub1_sub2 - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 3997705 /* reguse:AReg_64 */, %6:areg_64 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 3407881 /* reguse:AReg_64 */, %6:areg_64 GLOBAL_STORE_DWORDX2 %0, %other_use1, 0, 0, implicit $exec :: (store (s64), addrspace 1) SI_RETURN ... 
@@ -313,7 +313,7 @@ body: |
     ; CHECK-NEXT: %other_use1:vreg_64_align2 = COPY [[V_MFMA_F64_4X4X4F64_e64_]].sub2_sub3
     ; CHECK-NEXT: dead %other_use2:vreg_64 = COPY [[V_MFMA_F64_4X4X4F64_e64_]].sub1_sub2
     ; CHECK-NEXT: [[COPY3:%[0-9]+]]:areg_64 = COPY [[V_MFMA_F64_4X4X4F64_e64_]].sub1_sub2
-    ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 3997705 /* reguse:AReg_64 */, [[COPY3]]
+    ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 3407881 /* reguse:AReg_64 */, [[COPY3]]
     ; CHECK-NEXT: GLOBAL_STORE_DWORDX2 [[COPY]], %other_use1, 0, 0, implicit $exec :: (store (s64), addrspace 1)
     ; CHECK-NEXT: SI_RETURN
     %0:vreg_64_align2 = COPY $vgpr4_vgpr5
@@ -327,7 +327,7 @@ body: |
     %other_use1:vreg_64_align2 = COPY %4.sub2_sub3
     %other_use2:vreg_64 = COPY %4.sub1_sub2
     %6:areg_64 = COPY %4.sub1_sub2
-    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 3997705 /* reguse:AReg_64 */, %6:areg_64
+    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 3407881 /* reguse:AReg_64 */, %6:areg_64
     GLOBAL_STORE_DWORDX2 %0, %other_use1, 0, 0, implicit $exec :: (store (s64), addrspace 1)
     SI_RETURN
 ...
diff --git a/llvm/test/CodeGen/AMDGPU/sched-assert-dead-def-subreg-use-other-subreg.mir b/llvm/test/CodeGen/AMDGPU/sched-assert-dead-def-subreg-use-other-subreg.mir
index af882c06e1b4e..b754a6b897159 100644
--- a/llvm/test/CodeGen/AMDGPU/sched-assert-dead-def-subreg-use-other-subreg.mir
+++ b/llvm/test/CodeGen/AMDGPU/sched-assert-dead-def-subreg-use-other-subreg.mir
@@ -37,7 +37,7 @@ body: |
     ; CHECK-NEXT: dead [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128 = DS_READ_B128_gfx9 [[V_ADD_U32_e32_]], 0, 0, implicit $exec
     ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
     ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub1:vreg_512 = COPY [[COPY]].sub1
-    ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 1835018 /* regdef:VGPR_32 */, def dead [[COPY1]], 1835018 /* regdef:VGPR_32 */, def dead [[COPY]].sub1, 1835017 /* reguse:VGPR_32 */, [[COPY1]], 1835017 /* reguse:VGPR_32 */, [[COPY]].sub1
+    ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 1245194 /* regdef:VGPR_32 */, def dead [[COPY1]], 1245194 /* regdef:VGPR_32 */, def dead [[COPY]].sub1, 1245193 /* reguse:VGPR_32 */, [[COPY1]], 1245193 /* reguse:VGPR_32 */, [[COPY]].sub1
     ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub0:vreg_512 = COPY [[COPY]].sub0
     ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub3:vreg_512 = COPY [[COPY]].sub3
     ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub2:vreg_512 = COPY undef [[V_MOV_B32_e32_1]]
@@ -63,7 +63,7 @@ body: |
     undef %11.sub0:vreg_512 = COPY %4.sub0
     %12:vgpr_32 = COPY %4.sub0
     %11.sub1:vreg_512 = COPY %4.sub1
-    INLINEASM &"", 1 /* sideeffect attdialect */, 1835018 /* regdef:VGPR_32 */, def dead %12:vgpr_32, 1835018 /* regdef:VGPR_32 */, def dead %4.sub1:vreg_512, 1835017 /* reguse:VGPR_32 */, %12:vgpr_32, 1835017 /* reguse:VGPR_32 */, %4.sub1:vreg_512
+    INLINEASM &"", 1 /* sideeffect attdialect */, 1245194 /* regdef:VGPR_32 */, def dead %12:vgpr_32, 1245194 /* regdef:VGPR_32 */, def dead %4.sub1:vreg_512, 1245193 /* reguse:VGPR_32 */, %12:vgpr_32, 1245193 /* reguse:VGPR_32 */, %4.sub1:vreg_512
     %11.sub2:vreg_512 = COPY undef %1
     %11.sub3:vreg_512 = COPY %4.sub3
     %11.sub5:vreg_512 = COPY undef %1
diff --git a/llvm/test/CodeGen/AMDGPU/sched-handleMoveUp-subreg-def-across-subreg-def.mir b/llvm/test/CodeGen/AMDGPU/sched-handleMoveUp-subreg-def-across-subreg-def.mir
index 3e8e1878e0be5..5edb9669d98eb 100644
--- a/llvm/test/CodeGen/AMDGPU/sched-handleMoveUp-subreg-def-across-subreg-def.mir
+++ b/llvm/test/CodeGen/AMDGPU/sched-handleMoveUp-subreg-def-across-subreg-def.mir
@@ -40,18 +40,18 @@ body: |
     ; CHECK-NEXT: bb.1:
     ; CHECK-NEXT: successors: %bb.1(0x80000000)
     ; CHECK-NEXT: {{  $}}
-    ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 1835018 /* regdef:VGPR_32 */, def dead %11
+    ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 1245194 /* regdef:VGPR_32 */, def dead %11
     ; CHECK-NEXT: GLOBAL_STORE_DWORD undef %12:vreg_64, [[BUFFER_LOAD_DWORD_OFFEN]], 0, 0, implicit $exec :: (store (s32), addrspace 1)
     ; CHECK-NEXT: [[V_MOV_B32_e32_3:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
     ; CHECK-NEXT: [[V_MOV_B32_e32_4:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
     ; CHECK-NEXT: [[DS_READ_B64_gfx9_:%[0-9]+]]:vreg_64 = DS_READ_B64_gfx9 undef %14:vgpr_32, 0, 0, implicit $exec :: (load (s64), addrspace 3)
-    ; CHECK-NEXT: INLINEASM &"def $0 $1", 1 /* sideeffect attdialect */, 1835018 /* regdef:VGPR_32 */, def %15, 1835018 /* regdef:VGPR_32 */, def %16
+    ; CHECK-NEXT: INLINEASM &"def $0 $1", 1 /* sideeffect attdialect */, 1245194 /* regdef:VGPR_32 */, def %15, 1245194 /* regdef:VGPR_32 */, def %16
     ; CHECK-NEXT: [[DS_READ_B32_gfx9_:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 [[V_MOV_B32_e32_1]], 0, 0, implicit $exec
     ; CHECK-NEXT: [[DS_READ_B32_gfx9_1:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 [[V_MOV_B32_e32_2]], 0, 0, implicit $exec
     ; CHECK-NEXT: [[DS_READ_B32_gfx9_2:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 undef %20:vgpr_32, 0, 0, implicit $exec
-    ; CHECK-NEXT: INLINEASM &"def $0 $1", 1 /* sideeffect attdialect */, 1835018 /* regdef:VGPR_32 */, def %21, 1835018 /* regdef:VGPR_32 */, def %22
+    ; CHECK-NEXT: INLINEASM &"def $0 $1", 1 /* sideeffect attdialect */, 1245194 /* regdef:VGPR_32 */, def %21, 1245194 /* regdef:VGPR_32 */, def %22
     ; CHECK-NEXT: [[DS_READ_B32_gfx9_3:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 [[V_MOV_B32_e32_2]], 0, 0, implicit $exec
-    ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 1835018 /* regdef:VGPR_32 */, def dead [[V_MOV_B32_e32_3]], 1835018 /* regdef:VGPR_32 */, def dead [[V_MOV_B32_e32_4]], 1835017 /* reguse:VGPR_32 */, [[DS_READ_B64_gfx9_]].sub0, 2147483657 /* reguse tiedto:$0 */, [[V_MOV_B32_e32_3]](tied-def 3), 2147549193 /* reguse tiedto:$1 */, [[V_MOV_B32_e32_4]](tied-def 5), 1835017 /* reguse:VGPR_32 */, %15, 1835017 /* reguse:VGPR_32 */, %16, 1835017 /* reguse:VGPR_32 */, [[DS_READ_B32_gfx9_1]], 1835017 /* reguse:VGPR_32 */, [[DS_READ_B32_gfx9_]], 1835017 /* reguse:VGPR_32 */, [[DS_READ_B32_gfx9_3]], 1835017 /* reguse:VGPR_32 */, [[DS_READ_B32_gfx9_2]]
+    ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 1245194 /* regdef:VGPR_32 */, def dead [[V_MOV_B32_e32_3]], 1245194 /* regdef:VGPR_32 */, def dead [[V_MOV_B32_e32_4]], 1245193 /* reguse:VGPR_32 */, [[DS_READ_B64_gfx9_]].sub0, 2147483657 /* reguse tiedto:$0 */, [[V_MOV_B32_e32_3]](tied-def 3), 2147549193 /* reguse tiedto:$1 */, [[V_MOV_B32_e32_4]](tied-def 5), 1245193 /* reguse:VGPR_32 */, %15, 1245193 /* reguse:VGPR_32 */, %16, 1245193 /* reguse:VGPR_32 */, [[DS_READ_B32_gfx9_1]], 1245193 /* reguse:VGPR_32 */, [[DS_READ_B32_gfx9_]], 1245193 /* reguse:VGPR_32 */, [[DS_READ_B32_gfx9_3]], 1245193 /* reguse:VGPR_32 */, [[DS_READ_B32_gfx9_2]]
     ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]].sub1:vreg_64 = COPY [[V_MOV_B32_e32_1]]
     ; CHECK-NEXT: DS_WRITE_B32_gfx9 undef %28:vgpr_32, %21, 0, 0, implicit $exec :: (store (s32), addrspace 3)
     ; CHECK-NEXT: DS_WRITE_B32_gfx9 undef %29:vgpr_32, %22, 0, 0, implicit $exec :: (store (s32), addrspace 3)
@@ -94,21 +94,21 @@ body: |
     %10:vgpr_32 = IMPLICIT_DEF

   bb.1:
-    INLINEASM &"", 1 /* sideeffect attdialect */, 1835018 /* regdef:VGPR_32 */, def %11:vgpr_32
+    INLINEASM &"", 1 /* sideeffect attdialect */, 1245194 /* regdef:VGPR_32 */, def %11:vgpr_32
     GLOBAL_STORE_DWORD undef %12:vreg_64, %1, 0, 0, implicit $exec :: (store (s32), addrspace 1)
     %13:vreg_64 = DS_READ_B64_gfx9 undef %14:vgpr_32, 0, 0, implicit $exec :: (load (s64), addrspace 3)
-    INLINEASM &"def $0 $1", 1 /* sideeffect attdialect */, 1835018 /* regdef:VGPR_32 */, def %15:vgpr_32, 1835018 /* regdef:VGPR_32 */, def %16:vgpr_32
+    INLINEASM &"def $0 $1", 1 /* sideeffect attdialect */, 1245194 /* regdef:VGPR_32 */, def %15:vgpr_32, 1245194 /* regdef:VGPR_32 */, def %16:vgpr_32
     %17:vgpr_32 = DS_READ_B32_gfx9 %6, 0, 0, implicit $exec
     %18:vgpr_32 = DS_READ_B32_gfx9 %7, 0, 0, implicit $exec
     %19:vgpr_32 = DS_READ_B32_gfx9 undef %20:vgpr_32, 0, 0, implicit $exec
-    INLINEASM &"def $0 $1", 1 /* sideeffect attdialect */, 1835018 /* regdef:VGPR_32 */, def %21:vgpr_32, 1835018 /* regdef:VGPR_32 */, def %22:vgpr_32
+    INLINEASM &"def $0 $1", 1 /* sideeffect attdialect */, 1245194 /* regdef:VGPR_32 */, def %21:vgpr_32, 1245194 /* regdef:VGPR_32 */, def %22:vgpr_32
     %23:vgpr_32 = DS_READ_B32_gfx9 %7, 0, 0, implicit $exec
     %24:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
     %5.sub1:vreg_64 = COPY %6
     %25:vgpr_32 = V_ADD_U32_e32 1, %10, implicit $exec
     %26:sreg_64_xexec = V_CMP_GT_U32_e64 64, %25, implicit $exec
     %27:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    INLINEASM &"", 1 /* sideeffect attdialect */, 1835018 /* regdef:VGPR_32 */, def dead %24:vgpr_32, 1835018 /* regdef:VGPR_32 */, def dead %27:vgpr_32, 1835017 /* reguse:VGPR_32 */, %13.sub0:vreg_64, 2147483657 /* reguse tiedto:$0 */, %24:vgpr_32(tied-def 3), 2147549193 /* reguse tiedto:$1 */, %27:vgpr_32(tied-def 5), 1835017 /* reguse:VGPR_32 */, %15, 1835017 /* reguse:VGPR_32 */, %16, 1835017 /* reguse:VGPR_32 */, %18, 1835017 /* reguse:VGPR_32 */, %17, 1835017 /* reguse:VGPR_32 */, %23, 1835017 /* reguse:VGPR_32 */, %19
+    INLINEASM &"", 1 /* sideeffect attdialect */, 1245194 /* regdef:VGPR_32 */, def dead %24:vgpr_32, 1245194 /* regdef:VGPR_32 */, def dead %27:vgpr_32, 1245193 /* reguse:VGPR_32 */, %13.sub0:vreg_64, 2147483657 /* reguse tiedto:$0 */, %24:vgpr_32(tied-def 3), 2147549193 /* reguse tiedto:$1 */, %27:vgpr_32(tied-def 5), 1245193 /* reguse:VGPR_32 */, %15, 1245193 /* reguse:VGPR_32 */, %16, 1245193 /* reguse:VGPR_32 */, %18, 1245193 /* reguse:VGPR_32 */, %17, 1245193 /* reguse:VGPR_32 */, %23, 1245193 /* reguse:VGPR_32 */, %19
     DS_WRITE_B32_gfx9 undef %28:vgpr_32, %21, 0, 0, implicit $exec :: (store (s32), addrspace 3)
     DS_WRITE_B32_gfx9 undef %29:vgpr_32, %22, 0, 0, implicit $exec :: (store (s32), addrspace 3)
     DS_WRITE_B64_gfx9 undef %30:vgpr_32, %5, 0, 0, implicit $exec :: (store (s64), addrspace 3)
diff --git a/llvm/test/CodeGen/AMDGPU/spill-vector-superclass.ll b/llvm/test/CodeGen/AMDGPU/spill-vector-superclass.ll
index b13829e0351f6..8ecc0ad65a944 100644
--- a/llvm/test/CodeGen/AMDGPU/spill-vector-superclass.ll
+++ b/llvm/test/CodeGen/AMDGPU/spill-vector-superclass.ll
@@ -12,10 +12,10 @@ define amdgpu_kernel void @test_spill_av_class(<4 x i32> %arg) #0 {
 ; GCN-NEXT: [[AV_MOV_:%[0-9]+]]:vgpr_32 = AV_MOV_B32_IMM_PSEUDO 1, implicit $exec
 ; GCN-NEXT: [[AV_MOV_1:%[0-9]+]]:vgpr_32 = AV_MOV_B32_IMM_PSEUDO 2, implicit $exec
 ; GCN-NEXT: [[V_MFMA_I32_4X4X4I8_e64_:%[0-9]+]]:areg_128 = V_MFMA_I32_4X4X4I8_e64 [[AV_MOV_]], [[AV_MOV_1]], [[COPY]], 0, 0, 0, implicit $mode, implicit $exec
-; GCN-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 1835018 /* regdef:VGPR_32 */, def undef %14.sub0
+; GCN-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 1245194 /* regdef:VGPR_32 */, def undef %14.sub0
 ; GCN-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY [[V_MFMA_I32_4X4X4I8_e64_]]
 ; GCN-NEXT: GLOBAL_STORE_DWORDX4 undef %24:vreg_64, [[COPY1]], 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) poison`, addrspace 1)
-; GCN-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 3407881 /* reguse:VReg_64 */, %14
+; GCN-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2818057 /* reguse:VReg_64 */, %14
 ; GCN-NEXT: S_ENDPGM 0
   %v0 = call i32 asm sideeffect "; def $0", "=v"()
   %tmp = insertelement <2 x i32> poison, i32 %v0, i32 0
diff --git a/llvm/test/CodeGen/AMDGPU/subreg-undef-def-with-other-subreg-defs.mir b/llvm/test/CodeGen/AMDGPU/subreg-undef-def-with-other-subreg-defs.mir
index da6b57c776796..2fd1e36c4181e 100644
--- a/llvm/test/CodeGen/AMDGPU/subreg-undef-def-with-other-subreg-defs.mir
+++ b/llvm/test/CodeGen/AMDGPU/subreg-undef-def-with-other-subreg-defs.mir
@@ -28,9 +28,9 @@ body: |
     ; CHECK-NEXT: successors: %bb.1(0x80000000)
     ; CHECK-NEXT: {{  $}}
     ; CHECK-NEXT: [[DS_READ_B32_gfx9_:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 [[V_MOV_B32_e32_1]], 0, 0, implicit $exec :: (load (s32), addrspace 3)
-    ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 1835018 /* regdef:VGPR_32 */, def [[V_MOV_B32_e32_]], 2147483657 /* reguse tiedto:$0 */, [[V_MOV_B32_e32_]](tied-def 3)
-    ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[DS_READ_B32_gfx9_]]
-    ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 1835018 /* regdef:VGPR_32 */, def undef [[V_MOV_B32_e32_]].sub0, 1835018 /* regdef:VGPR_32 */, def undef [[V_MOV_B32_e32_]].sub1
+    ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 1245194 /* regdef:VGPR_32 */, def [[V_MOV_B32_e32_]], 2147483657 /* reguse tiedto:$0 */, [[V_MOV_B32_e32_]](tied-def 3)
+    ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[DS_READ_B32_gfx9_]]
+    ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 1245194 /* regdef:VGPR_32 */, def undef [[V_MOV_B32_e32_]].sub0, 1245194 /* regdef:VGPR_32 */, def undef [[V_MOV_B32_e32_]].sub1
     ; CHECK-NEXT: S_NOP 0, implicit [[V_MOV_B32_e32_]].sub1
     ; CHECK-NEXT: $sgpr10 = S_MOV_B32 -1
     ; CHECK-NEXT: S_BRANCH %bb.1
@@ -41,9 +41,9 @@ body: |

   bb.1:
     %2:vgpr_32 = DS_READ_B32_gfx9 %1, 0, 0, implicit $exec :: (load (s32), addrspace 3)
-    INLINEASM &"", 1 /* sideeffect attdialect */, 1835018 /* regdef:VGPR_32 */, def %0, 2147483657 /* reguse tiedto:$0 */, %0(tied-def 3)
-    INLINEASM &"", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, %2
-    INLINEASM &"", 1 /* sideeffect attdialect */, 1835018 /* regdef:VGPR_32 */, def undef %0.sub0, 1835018 /* regdef:VGPR_32 */, def %0.sub1
+    INLINEASM &"", 1 /* sideeffect attdialect */, 1245194 /* regdef:VGPR_32 */, def %0, 2147483657 /* reguse tiedto:$0 */, %0(tied-def 3)
+    INLINEASM &"", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, %2
+    INLINEASM &"", 1 /* sideeffect attdialect */, 1245194 /* regdef:VGPR_32 */, def undef %0.sub0, 1245194 /* regdef:VGPR_32 */, def %0.sub1
     S_NOP 0, implicit %0.sub1
     $sgpr10 = S_MOV_B32 -1
     S_BRANCH %bb.1
@@ -69,9 +69,9 @@ body: |
     ; CHECK-NEXT: successors: %bb.1(0x80000000)
     ; CHECK-NEXT: {{  $}}
     ; CHECK-NEXT: [[DS_READ_B32_gfx9_:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 [[V_MOV_B32_e32_1]], 0, 0, implicit $exec :: (load (s32), addrspace 3)
-    ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 1835018 /* regdef:VGPR_32 */, def [[V_MOV_B32_e32_]], 2147483657 /* reguse tiedto:$0 */, [[V_MOV_B32_e32_]](tied-def 3)
-    ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[DS_READ_B32_gfx9_]]
-    ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 1835018 /* regdef:VGPR_32 */, def undef [[V_MOV_B32_e32_]].sub1, 1835018 /* regdef:VGPR_32 */, def undef [[V_MOV_B32_e32_]].sub0
+    ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 1245194 /* regdef:VGPR_32 */, def [[V_MOV_B32_e32_]], 2147483657 /* reguse tiedto:$0 */, [[V_MOV_B32_e32_]](tied-def 3)
+    ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[DS_READ_B32_gfx9_]]
+    ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 1245194 /* regdef:VGPR_32 */, def undef [[V_MOV_B32_e32_]].sub1, 1245194 /* regdef:VGPR_32 */, def undef [[V_MOV_B32_e32_]].sub0
     ; CHECK-NEXT: S_NOP 0, implicit [[V_MOV_B32_e32_]].sub1
     ; CHECK-NEXT: $sgpr10 = S_MOV_B32 -1
     ; CHECK-NEXT: S_BRANCH %bb.1
@@ -82,9 +82,9 @@ body: |

   bb.1:
     %2:vgpr_32 = DS_READ_B32_gfx9 %1, 0, 0, implicit $exec :: (load (s32), addrspace 3)
-    INLINEASM &"", 1 /* sideeffect attdialect */, 1835018 /* regdef:VGPR_32 */, def %0, 2147483657 /* reguse tiedto:$0 */, %0(tied-def 3)
-    INLINEASM &"", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, %2
-    INLINEASM &"", 1 /* sideeffect attdialect */, 1835018 /* regdef:VGPR_32 */, def %0.sub1, 1835018 /* regdef:VGPR_32 */, def undef %0.sub0
+    INLINEASM &"", 1 /* sideeffect attdialect */, 1245194 /* regdef:VGPR_32 */, def %0, 2147483657 /* reguse tiedto:$0 */, %0(tied-def 3)
+    INLINEASM &"", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, %2
+    INLINEASM &"", 1 /* sideeffect attdialect */, 1245194 /* regdef:VGPR_32 */, def %0.sub1, 1245194 /* regdef:VGPR_32 */, def undef %0.sub0
     S_NOP 0, implicit %0.sub1
     $sgpr10 = S_MOV_B32 -1
     S_BRANCH %bb.1
diff --git a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/amdgpu_isel.ll.expected b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/amdgpu_isel.ll.expected
index f515ee651d835..8d50df7050636 100644
--- a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/amdgpu_isel.ll.expected
+++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/amdgpu_isel.ll.expected
@@ -7,10 +7,10 @@ define i64 @i64_test(i64 %i) nounwind readnone {
 ; CHECK-NEXT: t0: ch,glue = EntryToken
 ; CHECK-NEXT: t2: i32,ch = CopyFromReg # D:1 t0, Register:i32 %8
 ; CHECK-NEXT: t4: i32,ch = CopyFromReg # D:1 t0, Register:i32 %9
-; CHECK-NEXT: t50: i64 = REG_SEQUENCE # D:1 TargetConstant:i32<75>, t2, TargetConstant:i32<3>, t4, TargetConstant:i32<11>
+; CHECK-NEXT: t50: i64 = REG_SEQUENCE # D:1 TargetConstant:i32<66>, t2, TargetConstant:i32<3>, t4, TargetConstant:i32<11>
 ; CHECK-NEXT: t27: i32,ch = BUFFER_LOAD_DWORD_OFFEN TargetFrameIndex:i32<0>, Register:v4i32 $sgpr0_sgpr1_sgpr2_sgpr3, TargetConstant:i32<0>, TargetConstant:i32<0>, TargetConstant:i32<0>, TargetConstant:i1<0>, t0
 ; CHECK-NEXT: t30: i32,ch = BUFFER_LOAD_DWORD_OFFEN TargetFrameIndex:i32<0>, Register:v4i32 $sgpr0_sgpr1_sgpr2_sgpr3, TargetConstant:i32<0>, TargetConstant:i32<4>, TargetConstant:i32<0>, TargetConstant:i1<0>, t0
-; CHECK-NEXT: t33: v2i32 = REG_SEQUENCE # D:1 TargetConstant:i32<75>, t27, TargetConstant:i32<3>, t30, TargetConstant:i32<11>
+; CHECK-NEXT: t33: v2i32 = REG_SEQUENCE # D:1 TargetConstant:i32<66>, t27, TargetConstant:i32<3>, t30, TargetConstant:i32<11>
 ; CHECK-NEXT: t10: i64 = V_ADD_U64_PSEUDO # D:1 t50, t33
 ; CHECK-NEXT: t24: i32 = EXTRACT_SUBREG # D:1 t10, TargetConstant:i32<3>
 ; CHECK-NEXT: t17: ch,glue = CopyToReg # D:1 t0, Register:i32 $vgpr0, t24

From 34a476381b19940042883c0419fe80da4a296b96 Mon Sep 17 00:00:00 2001
From: Ron Lieberman
Date: Tue, 11 Nov 2025 20:45:48 -0600
Subject: [PATCH 64/64] mark unstable llvm/test/MC/MachO/invalid-section-index.s

---
 llvm/test/MC/MachO/invalid-section-index.s | 1 +
 1 file changed, 1 insertion(+)

diff --git a/llvm/test/MC/MachO/invalid-section-index.s b/llvm/test/MC/MachO/invalid-section-index.s
index 55a0ce5b40ea7..80e1c64d9cbd5 100644
--- a/llvm/test/MC/MachO/invalid-section-index.s
+++ b/llvm/test/MC/MachO/invalid-section-index.s
@@ -1,4 +1,5 @@
 /// Test that when there are more than 255 sections, an error is shown specifying too many sections.
+// REQUIRES: stability
 // RUN: not llvm-mc -filetype=obj -triple arm64-apple-macos %s -o - 2>&1 | FileCheck %s --check-prefix=MACHOERROR