From 0bd07652524ebacdee166eb609fef48c50769b09 Mon Sep 17 00:00:00 2001 From: Matthias Gehre Date: Fri, 17 Jan 2025 09:06:04 +0100 Subject: [PATCH 01/45] EmitC: Allow arrays of size zero (#123292) This is allowed as a GCC extension, see https://gcc.gnu.org/onlinedocs/gcc/Zero-Length.html. --- mlir/docs/Dialects/emitc.md | 2 ++ mlir/lib/Dialect/EmitC/IR/EmitC.cpp | 4 ++-- mlir/test/Dialect/EmitC/invalid_types.mlir | 8 -------- mlir/test/Dialect/EmitC/types.mlir | 4 +++- 4 files changed, 7 insertions(+), 11 deletions(-) diff --git a/mlir/docs/Dialects/emitc.md b/mlir/docs/Dialects/emitc.md index 743d70959f3d8..e2288f518dae1 100644 --- a/mlir/docs/Dialects/emitc.md +++ b/mlir/docs/Dialects/emitc.md @@ -16,6 +16,8 @@ The following convention is followed: floating types. * If `__bf16` is used, the code requires a compiler that supports it, such as GCC or Clang. +* If `emitc.array` with a dimension of size zero is used, then the code + requires [a GCC extension](https://gcc.gnu.org/onlinedocs/gcc/Zero-Length.html). * Else the generated code is compatible with C99. These restrictions are neither inherent to the EmitC dialect itself nor to the diff --git a/mlir/lib/Dialect/EmitC/IR/EmitC.cpp b/mlir/lib/Dialect/EmitC/IR/EmitC.cpp index fdc21d6c6e24b..c818dd18a3d24 100644 --- a/mlir/lib/Dialect/EmitC/IR/EmitC.cpp +++ b/mlir/lib/Dialect/EmitC/IR/EmitC.cpp @@ -971,8 +971,8 @@ LogicalResult emitc::ArrayType::verify( return emitError() << "shape must not be empty"; for (int64_t dim : shape) { - if (dim <= 0) - return emitError() << "dimensions must have positive size"; + if (dim < 0) + return emitError() << "dimensions must have non-negative size"; } if (!elementType) diff --git a/mlir/test/Dialect/EmitC/invalid_types.mlir b/mlir/test/Dialect/EmitC/invalid_types.mlir index 302a345c7c4f4..c39a881ff26ad 100644 --- a/mlir/test/Dialect/EmitC/invalid_types.mlir +++ b/mlir/test/Dialect/EmitC/invalid_types.mlir @@ -36,14 +36,6 @@ func.func @illegal_array_missing_x( // ----- -func.func @illegal_array_non_positive_dimenson( - // expected-error @+1 {{dimensions must have positive size}} - %arg0: !emitc.array<0xi32> -) { -} - -// ----- - func.func @illegal_array_missing_type( // expected-error @+1 {{expected non-function type}} %arg0: !emitc.array<10x> diff --git a/mlir/test/Dialect/EmitC/types.mlir b/mlir/test/Dialect/EmitC/types.mlir index e3462bffc5b0d..d4dd94457f39b 100644 --- a/mlir/test/Dialect/EmitC/types.mlir +++ b/mlir/test/Dialect/EmitC/types.mlir @@ -17,7 +17,9 @@ func.func @array_types( // CHECK-SAME: !emitc.array<30x!emitc.ssize_t> %arg5: !emitc.array<30x!emitc.ssize_t>, // CHECK-SAME: !emitc.array<30x!emitc.ptrdiff_t> - %arg6: !emitc.array<30x!emitc.ptrdiff_t> + %arg6: !emitc.array<30x!emitc.ptrdiff_t>, + // CHECK-SAME: !emitc.array<0xi64> + %arg7: !emitc.array<0xi64> ) { return } From 1274bca2ad5befe56d82ef76100e2c294ca57ce2 Mon Sep 17 00:00:00 2001 From: Phoebe Wang Date: Fri, 17 Jan 2025 16:06:31 +0800 Subject: [PATCH 02/45] [X86][APX] Support APX + MOVRS (#123264) Ref.: https://cdrdv2.intel.com/v1/dl/getContent/784266 --- llvm/lib/Target/X86/X86InstrAVX10.td | 4 +- llvm/lib/Target/X86/X86InstrMisc.td | 21 ++++- llvm/test/CodeGen/X86/movrs-builtins.ll | 21 +++++ llvm/test/MC/Disassembler/X86/movrs.txt | 98 +++++++++++++++++++++++- llvm/test/MC/X86/movrs-att-64.s | 98 +++++++++++++++++++++++- llvm/test/MC/X86/movrs-intel-64.s | 98 +++++++++++++++++++++++- llvm/test/TableGen/x86-instr-mapping.inc | 4 + 7 files changed, 337 insertions(+), 7 deletions(-) diff --git 
a/llvm/lib/Target/X86/X86InstrAVX10.td b/llvm/lib/Target/X86/X86InstrAVX10.td index 127016184bc17..edbcb17297603 100644 --- a/llvm/lib/Target/X86/X86InstrAVX10.td +++ b/llvm/lib/Target/X86/X86InstrAVX10.td @@ -1767,9 +1767,9 @@ multiclass vmovrs_p opc, string OpStr, X86VectorVTInfo _> { } multiclass vmovrs_p_vl opc, string OpStr, AVX512VLVectorVTInfo _Vec> { - let Predicates = [HasMOVRS, HasAVX10_2_512] in + let Predicates = [HasMOVRS, HasAVX10_2_512, In64BitMode] in defm Z : vmovrs_p, EVEX_V512; - let Predicates = [HasMOVRS, HasAVX10_2] in { + let Predicates = [HasMOVRS, HasAVX10_2, In64BitMode] in { defm Z128 : vmovrs_p, EVEX_V128; defm Z256 : vmovrs_p, EVEX_V256; } diff --git a/llvm/lib/Target/X86/X86InstrMisc.td b/llvm/lib/Target/X86/X86InstrMisc.td index 9fabe2acf0019..43c02c4f85844 100644 --- a/llvm/lib/Target/X86/X86InstrMisc.td +++ b/llvm/lib/Target/X86/X86InstrMisc.td @@ -1733,7 +1733,7 @@ def CLDEMOTE : I<0x1C, MRM0m, (outs), (ins i8mem:$src), "cldemote\t$src", // let SchedRW = [WriteLoad] in { -let Predicates = [HasMOVRS, NoEGPR] in { +let Predicates = [HasMOVRS, NoEGPR, In64BitMode] in { def MOVRS8rm : I<0x8A, MRMSrcMem, (outs GR8 :$dst), (ins i8mem :$src), "movrs{b}\t{$src, $dst|$dst, $src}", [(set GR8:$dst, (int_x86_movrsqi addr:$src))]>, T8; @@ -1746,8 +1746,25 @@ def MOVRS32rm : I<0x8B, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src), def MOVRS64rm : RI<0x8B, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src), "movrs{q}\t{$src, $dst|$dst, $src}", [(set GR64:$dst, (int_x86_movrsdi addr:$src))]>, T8; +} + +let Predicates = [HasMOVRS] in def PREFETCHRST2 : I<0x18, MRM4m, (outs), (ins i8mem:$src), "prefetchrst2\t$src", [(int_x86_prefetchrs addr:$src)]>, TB; + +let Predicates = [HasMOVRS, HasEGPR, In64BitMode] in { +def MOVRS8rm_EVEX : I<0x8A, MRMSrcMem, (outs GR8 :$dst), (ins i8mem :$src), + "movrs{b}\t{$src, $dst|$dst, $src}", + [(set GR8:$dst, (int_x86_movrsqi addr:$src))]>, EVEX, NoCD8, T_MAP4; +def MOVRS16rm_EVEX : I<0x8B, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src), + "movrs{w}\t{$src, $dst|$dst, $src}", + [(set GR16:$dst, (int_x86_movrshi addr:$src))]>, EVEX, NoCD8, PD, T_MAP4; +def MOVRS32rm_EVEX : I<0x8B, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src), + "movrs{l}\t{$src, $dst|$dst, $src}", + [(set GR32:$dst, (int_x86_movrssi addr:$src))]>, EVEX, NoCD8, T_MAP4; +def MOVRS64rm_EVEX : I<0x8B, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src), + "movrs{q}\t{$src, $dst|$dst, $src}", + [(set GR64:$dst, (int_x86_movrsdi addr:$src))]>, EVEX, NoCD8, T_MAP4, REX_W; +} } -} \ No newline at end of file diff --git a/llvm/test/CodeGen/X86/movrs-builtins.ll b/llvm/test/CodeGen/X86/movrs-builtins.ll index c1722c831c95d..ccf0833e53990 100644 --- a/llvm/test/CodeGen/X86/movrs-builtins.ll +++ b/llvm/test/CodeGen/X86/movrs-builtins.ll @@ -1,11 +1,17 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3 ; RUN: llc < %s -verify-machineinstrs -mtriple=x86_64-unknown-unknown --show-mc-encoding -mattr=+movrs | FileCheck %s +; RUN: llc < %s -verify-machineinstrs -mtriple=x86_64-unknown-unknown --show-mc-encoding -mattr=+movrs,+egpr | FileCheck %s --check-prefix=EGPR define i8 @test_movrs_si8(ptr %__A) { ; CHECK-LABEL: test_movrs_si8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: movrsb (%rdi), %al # encoding: [0x0f,0x38,0x8a,0x07] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; EGPR-LABEL: test_movrs_si8: +; EGPR: # %bb.0: # %entry +; EGPR-NEXT: movrsb (%rdi), %al # EVEX TO LEGACY Compression encoding: [0x0f,0x38,0x8a,0x07] +; EGPR-NEXT: retq # encoding: 
[0xc3] entry: %0 = call i8 @llvm.x86.movrsqi(ptr %__A) ret i8 %0 @@ -17,6 +23,11 @@ define i16 @test_movrs_si16(ptr %__A) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: movrsw (%rdi), %ax # encoding: [0x66,0x0f,0x38,0x8b,0x07] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; EGPR-LABEL: test_movrs_si16: +; EGPR: # %bb.0: # %entry +; EGPR-NEXT: movrsw (%rdi), %ax # EVEX TO LEGACY Compression encoding: [0x66,0x0f,0x38,0x8b,0x07] +; EGPR-NEXT: retq # encoding: [0xc3] entry: %0 = call i16 @llvm.x86.movrshi(ptr %__A) ret i16 %0 @@ -28,6 +39,11 @@ define i32 @test_movrs_si32(ptr %__A) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: movrsl (%rdi), %eax # encoding: [0x0f,0x38,0x8b,0x07] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; EGPR-LABEL: test_movrs_si32: +; EGPR: # %bb.0: # %entry +; EGPR-NEXT: movrsl (%rdi), %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x38,0x8b,0x07] +; EGPR-NEXT: retq # encoding: [0xc3] entry: %0 = call i32 @llvm.x86.movrssi(ptr %__A) ret i32 %0 @@ -39,6 +55,11 @@ define i64 @test_movrs_si64(ptr %__A) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: movrsq (%rdi), %rax # encoding: [0x48,0x0f,0x38,0x8b,0x07] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; EGPR-LABEL: test_movrs_si64: +; EGPR: # %bb.0: # %entry +; EGPR-NEXT: movrsq (%rdi), %rax # EVEX TO LEGACY Compression encoding: [0x48,0x0f,0x38,0x8b,0x07] +; EGPR-NEXT: retq # encoding: [0xc3] entry: %0 = call i64 @llvm.x86.movrsdi(ptr %__A) ret i64 %0 diff --git a/llvm/test/MC/Disassembler/X86/movrs.txt b/llvm/test/MC/Disassembler/X86/movrs.txt index fa91b542d3f73..caac8bc8b7b30 100644 --- a/llvm/test/MC/Disassembler/X86/movrs.txt +++ b/llvm/test/MC/Disassembler/X86/movrs.txt @@ -95,4 +95,100 @@ # ATT: movrsq -128(%rdx), %rbx # INTEL: movrs rbx, qword ptr [rdx - 128] -0x48,0x0f,0x38,0x8b,0x5a,0x80 \ No newline at end of file +0x48,0x0f,0x38,0x8b,0x5a,0x80 + +# ATT: movrsb 268435456(%rbp,%r14,8), %r16b +# INTEL: movrs r16b, byte ptr [rbp + 8*r14 + 268435456] +0x62,0xa4,0x7c,0x08,0x8a,0x84,0xf5,0x00,0x00,0x00,0x10 + +# ATT: movrsb 291(%r17,%rax,4), %bl +# INTEL: movrs bl, byte ptr [r17 + 4*rax + 291] +0x62,0xfc,0x7c,0x08,0x8a,0x9c,0x81,0x23,0x01,0x00,0x00 + +# ATT: movrsb (%rip), %bl +# INTEL: movrs bl, byte ptr [rip] +0x62,0xf4,0x7c,0x08,0x8a,0x1d,0x00,0x00,0x00,0x00 + +# ATT: movrsb -32(,%rbp,2), %r18b +# INTEL: movrs r18b, byte ptr [2*rbp - 32] +0x62,0xe4,0x7c,0x08,0x8a,0x14,0x6d,0xe0,0xff,0xff,0xff + +# ATT: movrsb 127(%r19), %bl +# INTEL: movrs bl, byte ptr [r19 + 127] +0x62,0xfc,0x7c,0x08,0x8a,0x5b,0x7f + +# ATT: movrsb -128(%r20,%riz), %bl +# INTEL: movrs bl, byte ptr [r20 + riz - 128] +0x62,0xfc,0x7c,0x08,0x8a,0x5c,0x24,0x80 + +# ATT: movrsw 268435456(%rbp,%r14,8), %r16w +# INTEL: movrs r16w, word ptr [rbp + 8*r14 + 268435456] +0x62,0xa4,0x7d,0x08,0x8b,0x84,0xf5,0x00,0x00,0x00,0x10 + +# ATT: movrsw 291(%r17,%rax,4), %bx +# INTEL: movrs bx, word ptr [r17 + 4*rax + 291] +0x62,0xfc,0x7d,0x08,0x8b,0x9c,0x81,0x23,0x01,0x00,0x00 + +# ATT: movrsw (%rip), %bx +# INTEL: movrs bx, word ptr [rip] +0x62,0xf4,0x7d,0x08,0x8b,0x1d,0x00,0x00,0x00,0x00 + +# ATT: movrsw -32(,%rbp,2), %r18w +# INTEL: movrs r18w, word ptr [2*rbp - 32] +0x62,0xe4,0x7d,0x08,0x8b,0x14,0x6d,0xe0,0xff,0xff,0xff + +# ATT: movrsw 127(%r19), %bx +# INTEL: movrs bx, word ptr [r19 + 127] +0x62,0xfc,0x7d,0x08,0x8b,0x5b,0x7f + +# ATT: movrsw -128(%r20,%riz), %bx +# INTEL: movrs bx, word ptr [r20 + riz - 128] +0x62,0xfc,0x7d,0x08,0x8b,0x5c,0x24,0x80 + +# ATT: movrsl 268435456(%rbp,%r14,8), %r16d +# INTEL: movrs r16d, dword ptr [rbp + 8*r14 + 268435456] 
+0x62,0xa4,0x7c,0x08,0x8b,0x84,0xf5,0x00,0x00,0x00,0x10 + +# ATT: movrsl 291(%r17,%rax,4), %ebx +# INTEL: movrs ebx, dword ptr [r17 + 4*rax + 291] +0x62,0xfc,0x7c,0x08,0x8b,0x9c,0x81,0x23,0x01,0x00,0x00 + +# ATT: movrsl (%rip), %ebx +# INTEL: movrs ebx, dword ptr [rip] +0x62,0xf4,0x7c,0x08,0x8b,0x1d,0x00,0x00,0x00,0x00 + +# ATT: movrsl -32(,%rbp,2), %r18d +# INTEL: movrs r18d, dword ptr [2*rbp - 32] +0x62,0xe4,0x7c,0x08,0x8b,0x14,0x6d,0xe0,0xff,0xff,0xff + +# ATT: movrsl 127(%r19), %ebx +# INTEL: movrs ebx, dword ptr [r19 + 127] +0x62,0xfc,0x7c,0x08,0x8b,0x5b,0x7f + +# ATT: movrsl -128(%r20,%riz), %ebx +# INTEL: movrs ebx, dword ptr [r20 + riz - 128] +0x62,0xfc,0x7c,0x08,0x8b,0x5c,0x24,0x80 + +# ATT: movrsq 268435456(%rbp,%r14,8), %r16 +# INTEL: movrs r16, qword ptr [rbp + 8*r14 + 268435456] +0x62,0xa4,0xfc,0x08,0x8b,0x84,0xf5,0x00,0x00,0x00,0x10 + +# ATT: movrsq 291(%r17,%rax,4), %rbx +# INTEL: movrs rbx, qword ptr [r17 + 4*rax + 291] +0x62,0xfc,0xfc,0x08,0x8b,0x9c,0x81,0x23,0x01,0x00,0x00 + +# ATT: movrsq (%rip), %rbx +# INTEL: movrs rbx, qword ptr [rip] +0x62,0xf4,0xfc,0x08,0x8b,0x1d,0x00,0x00,0x00,0x00 + +# ATT: movrsq -32(,%rbp,2), %r18 +# INTEL: movrs r18, qword ptr [2*rbp - 32] +0x62,0xe4,0xfc,0x08,0x8b,0x14,0x6d,0xe0,0xff,0xff,0xff + +# ATT: movrsq 127(%r19), %rbx +# INTEL: movrs rbx, qword ptr [r19 + 127] +0x62,0xfc,0xfc,0x08,0x8b,0x5b,0x7f + +# ATT: movrsq -128(%r20,%riz), %rbx +# INTEL: movrs rbx, qword ptr [r20 + riz - 128] +0x62,0xfc,0xfc,0x08,0x8b,0x5c,0x24,0x80 diff --git a/llvm/test/MC/X86/movrs-att-64.s b/llvm/test/MC/X86/movrs-att-64.s index 59a2fdb6d10b2..e951b30369d46 100644 --- a/llvm/test/MC/X86/movrs-att-64.s +++ b/llvm/test/MC/X86/movrs-att-64.s @@ -94,4 +94,100 @@ // CHECK: movrsq -128(%rdx), %rbx // CHECK: encoding: [0x48,0x0f,0x38,0x8b,0x5a,0x80] - movrs -128(%rdx), %rbx \ No newline at end of file + movrs -128(%rdx), %rbx + +// CHECK: movrsb 268435456(%rbp,%r14,8), %r16b +// CHECK: encoding: [0x62,0xa4,0x7c,0x08,0x8a,0x84,0xf5,0x00,0x00,0x00,0x10] + movrs 268435456(%rbp,%r14,8), %r16b + +// CHECK: movrsb 291(%r17,%rax,4), %bl +// CHECK: encoding: [0x62,0xfc,0x7c,0x08,0x8a,0x9c,0x81,0x23,0x01,0x00,0x00] + movrs 291(%r17,%rax,4), %bl + +// CHECK: {evex} movrsb (%rip), %bl +// CHECK: encoding: [0x62,0xf4,0x7c,0x08,0x8a,0x1d,0x00,0x00,0x00,0x00] + {evex} movrs (%rip), %bl + +// CHECK: movrsb -32(,%rbp,2), %r18b +// CHECK: encoding: [0x62,0xe4,0x7c,0x08,0x8a,0x14,0x6d,0xe0,0xff,0xff,0xff] + movrs -32(,%rbp,2), %r18b + +// CHECK: movrsb 127(%r19), %bl +// CHECK: encoding: [0x62,0xfc,0x7c,0x08,0x8a,0x5b,0x7f] + movrs 127(%r19), %bl + +// CHECK: movrsb -128(%r20), %bl +// CHECK: encoding: [0x62,0xfc,0x7c,0x08,0x8a,0x5c,0x24,0x80] + movrs -128(%r20), %bl + +// CHECK: movrsw 268435456(%rbp,%r14,8), %r16w +// CHECK: encoding: [0x62,0xa4,0x7d,0x08,0x8b,0x84,0xf5,0x00,0x00,0x00,0x10] + movrs 268435456(%rbp,%r14,8), %r16w + +// CHECK: movrsw 291(%r17,%rax,4), %bx +// CHECK: encoding: [0x62,0xfc,0x7d,0x08,0x8b,0x9c,0x81,0x23,0x01,0x00,0x00] + movrs 291(%r17,%rax,4), %bx + +// CHECK: {evex} movrsw (%rip), %bx +// CHECK: encoding: [0x62,0xf4,0x7d,0x08,0x8b,0x1d,0x00,0x00,0x00,0x00] + {evex} movrs (%rip), %bx + +// CHECK: movrsw -32(,%rbp,2), %r18w +// CHECK: encoding: [0x62,0xe4,0x7d,0x08,0x8b,0x14,0x6d,0xe0,0xff,0xff,0xff] + movrs -32(,%rbp,2), %r18w + +// CHECK: movrsw 127(%r19), %bx +// CHECK: encoding: [0x62,0xfc,0x7d,0x08,0x8b,0x5b,0x7f] + movrs 127(%r19), %bx + +// CHECK: movrsw -128(%r20), %bx +// CHECK: encoding: [0x62,0xfc,0x7d,0x08,0x8b,0x5c,0x24,0x80] + movrs 
-128(%r20), %bx + +// CHECK: movrsl 268435456(%rbp,%r14,8), %r16d +// CHECK: encoding: [0x62,0xa4,0x7c,0x08,0x8b,0x84,0xf5,0x00,0x00,0x00,0x10] + movrs 268435456(%rbp,%r14,8), %r16d + +// CHECK: movrsl 291(%r17,%rax,4), %ebx +// CHECK: encoding: [0x62,0xfc,0x7c,0x08,0x8b,0x9c,0x81,0x23,0x01,0x00,0x00] + movrs 291(%r17,%rax,4), %ebx + +// CHECK: {evex} movrsl (%rip), %ebx +// CHECK: encoding: [0x62,0xf4,0x7c,0x08,0x8b,0x1d,0x00,0x00,0x00,0x00] + {evex} movrs (%rip), %ebx + +// CHECK: movrsl -32(,%rbp,2), %r18d +// CHECK: encoding: [0x62,0xe4,0x7c,0x08,0x8b,0x14,0x6d,0xe0,0xff,0xff,0xff] + movrs -32(,%rbp,2), %r18d + +// CHECK: movrsl 127(%r19), %ebx +// CHECK: encoding: [0x62,0xfc,0x7c,0x08,0x8b,0x5b,0x7f] + movrs 127(%r19), %ebx + +// CHECK: movrsl -128(%r20), %ebx +// CHECK: encoding: [0x62,0xfc,0x7c,0x08,0x8b,0x5c,0x24,0x80] + movrs -128(%r20), %ebx + +// CHECK: movrsq 268435456(%rbp,%r14,8), %r16 +// CHECK: encoding: [0x62,0xa4,0xfc,0x08,0x8b,0x84,0xf5,0x00,0x00,0x00,0x10] + movrs 268435456(%rbp,%r14,8), %r16 + +// CHECK: movrsq 291(%r17,%rax,4), %rbx +// CHECK: encoding: [0x62,0xfc,0xfc,0x08,0x8b,0x9c,0x81,0x23,0x01,0x00,0x00] + movrs 291(%r17,%rax,4), %rbx + +// CHECK: {evex} movrsq (%rip), %rbx +// CHECK: encoding: [0x62,0xf4,0xfc,0x08,0x8b,0x1d,0x00,0x00,0x00,0x00] + {evex} movrs (%rip), %rbx + +// CHECK: movrsq -32(,%rbp,2), %r18 +// CHECK: encoding: [0x62,0xe4,0xfc,0x08,0x8b,0x14,0x6d,0xe0,0xff,0xff,0xff] + movrs -32(,%rbp,2), %r18 + +// CHECK: movrsq 127(%r19), %rbx +// CHECK: encoding: [0x62,0xfc,0xfc,0x08,0x8b,0x5b,0x7f] + movrs 127(%r19), %rbx + +// CHECK: movrsq -128(%r20), %rbx +// CHECK: encoding: [0x62,0xfc,0xfc,0x08,0x8b,0x5c,0x24,0x80] + movrs -128(%r20), %rbx diff --git a/llvm/test/MC/X86/movrs-intel-64.s b/llvm/test/MC/X86/movrs-intel-64.s index f41075a21b3e8..f698f1c440442 100644 --- a/llvm/test/MC/X86/movrs-intel-64.s +++ b/llvm/test/MC/X86/movrs-intel-64.s @@ -94,4 +94,100 @@ // CHECK: movrs rbx, qword ptr [rdx - 128] // CHECK: encoding: [0x48,0x0f,0x38,0x8b,0x5a,0x80] - movrs rbx, qword ptr [rdx - 128] \ No newline at end of file + movrs rbx, qword ptr [rdx - 128] + +// CHECK: movrs r16b, byte ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa4,0x7c,0x08,0x8a,0x84,0xf5,0x00,0x00,0x00,0x10] + movrs r16b, byte ptr [rbp + 8*r14 + 268435456] + +// CHECK: movrs bl, byte ptr [r17 + 4*rax + 291] +// CHECK: encoding: [0x62,0xfc,0x7c,0x08,0x8a,0x9c,0x81,0x23,0x01,0x00,0x00] + movrs bl, byte ptr [r17 + 4*rax + 291] + +// CHECK: {evex} movrs bl, byte ptr [rip] +// CHECK: encoding: [0x62,0xf4,0x7c,0x08,0x8a,0x1d,0x00,0x00,0x00,0x00] + {evex} movrs bl, byte ptr [rip] + +// CHECK: movrs r18b, byte ptr [2*rbp - 32] +// CHECK: encoding: [0x62,0xe4,0x7c,0x08,0x8a,0x14,0x6d,0xe0,0xff,0xff,0xff] + movrs r18b, byte ptr [2*rbp - 32] + +// CHECK: movrs bl, byte ptr [r19 + 127] +// CHECK: encoding: [0x62,0xfc,0x7c,0x08,0x8a,0x5b,0x7f] + movrs bl, byte ptr [r19 + 127] + +// CHECK: movrs bl, byte ptr [r20 - 128] +// CHECK: encoding: [0x62,0xfc,0x7c,0x08,0x8a,0x5c,0x24,0x80] + movrs bl, byte ptr [r20 - 128] + +// CHECK: movrs r16w, word ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa4,0x7d,0x08,0x8b,0x84,0xf5,0x00,0x00,0x00,0x10] + movrs r16w, word ptr [rbp + 8*r14 + 268435456] + +// CHECK: movrs bx, word ptr [r17 + 4*rax + 291] +// CHECK: encoding: [0x62,0xfc,0x7d,0x08,0x8b,0x9c,0x81,0x23,0x01,0x00,0x00] + movrs bx, word ptr [r17 + 4*rax + 291] + +// CHECK: {evex} movrs bx, word ptr [rip] +// CHECK: encoding: [0x62,0xf4,0x7d,0x08,0x8b,0x1d,0x00,0x00,0x00,0x00] + 
{evex} movrs bx, word ptr [rip] + +// CHECK: movrs r18w, word ptr [2*rbp - 32] +// CHECK: encoding: [0x62,0xe4,0x7d,0x08,0x8b,0x14,0x6d,0xe0,0xff,0xff,0xff] + movrs r18w, word ptr [2*rbp - 32] + +// CHECK: movrs bx, word ptr [r19 + 127] +// CHECK: encoding: [0x62,0xfc,0x7d,0x08,0x8b,0x5b,0x7f] + movrs bx, word ptr [r19 + 127] + +// CHECK: movrs bx, word ptr [r20 - 128] +// CHECK: encoding: [0x62,0xfc,0x7d,0x08,0x8b,0x5c,0x24,0x80] + movrs bx, word ptr [r20 - 128] + +// CHECK: movrs r16d, dword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa4,0x7c,0x08,0x8b,0x84,0xf5,0x00,0x00,0x00,0x10] + movrs r16d, dword ptr [rbp + 8*r14 + 268435456] + +// CHECK: movrs ebx, dword ptr [r17 + 4*rax + 291] +// CHECK: encoding: [0x62,0xfc,0x7c,0x08,0x8b,0x9c,0x81,0x23,0x01,0x00,0x00] + movrs ebx, dword ptr [r17 + 4*rax + 291] + +// CHECK: {evex} movrs ebx, dword ptr [rip] +// CHECK: encoding: [0x62,0xf4,0x7c,0x08,0x8b,0x1d,0x00,0x00,0x00,0x00] + {evex} movrs ebx, dword ptr [rip] + +// CHECK: movrs r18d, dword ptr [2*rbp - 32] +// CHECK: encoding: [0x62,0xe4,0x7c,0x08,0x8b,0x14,0x6d,0xe0,0xff,0xff,0xff] + movrs r18d, dword ptr [2*rbp - 32] + +// CHECK: movrs ebx, dword ptr [r19 + 127] +// CHECK: encoding: [0x62,0xfc,0x7c,0x08,0x8b,0x5b,0x7f] + movrs ebx, dword ptr [r19 + 127] + +// CHECK: movrs ebx, dword ptr [r20 - 128] +// CHECK: encoding: [0x62,0xfc,0x7c,0x08,0x8b,0x5c,0x24,0x80] + movrs ebx, dword ptr [r20 - 128] + +// CHECK: movrs r16, qword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa4,0xfc,0x08,0x8b,0x84,0xf5,0x00,0x00,0x00,0x10] + movrs r16, qword ptr [rbp + 8*r14 + 268435456] + +// CHECK: movrs rbx, qword ptr [r17 + 4*rax + 291] +// CHECK: encoding: [0x62,0xfc,0xfc,0x08,0x8b,0x9c,0x81,0x23,0x01,0x00,0x00] + movrs rbx, qword ptr [r17 + 4*rax + 291] + +// CHECK: {evex} movrs rbx, qword ptr [rip] +// CHECK: encoding: [0x62,0xf4,0xfc,0x08,0x8b,0x1d,0x00,0x00,0x00,0x00] + {evex} movrs rbx, qword ptr [rip] + +// CHECK: movrs r18, qword ptr [2*rbp - 32] +// CHECK: encoding: [0x62,0xe4,0xfc,0x08,0x8b,0x14,0x6d,0xe0,0xff,0xff,0xff] + movrs r18, qword ptr [2*rbp - 32] + +// CHECK: movrs rbx, qword ptr [r19 + 127] +// CHECK: encoding: [0x62,0xfc,0xfc,0x08,0x8b,0x5b,0x7f] + movrs rbx, qword ptr [r19 + 127] + +// CHECK: movrs rbx, qword ptr [r20 - 128] +// CHECK: encoding: [0x62,0xfc,0xfc,0x08,0x8b,0x5c,0x24,0x80] + movrs rbx, qword ptr [r20 - 128] diff --git a/llvm/test/TableGen/x86-instr-mapping.inc b/llvm/test/TableGen/x86-instr-mapping.inc index ed43684db2dfc..55d392f5e271f 100644 --- a/llvm/test/TableGen/x86-instr-mapping.inc +++ b/llvm/test/TableGen/x86-instr-mapping.inc @@ -133,6 +133,10 @@ static const X86TableEntry X86CompressEVEXTable[] = { { X86::MOVDIR64B64_EVEX, X86::MOVDIR64B64 }, { X86::MOVDIRI32_EVEX, X86::MOVDIRI32 }, { X86::MOVDIRI64_EVEX, X86::MOVDIRI64 }, + { X86::MOVRS16rm_EVEX, X86::MOVRS16rm }, + { X86::MOVRS32rm_EVEX, X86::MOVRS32rm }, + { X86::MOVRS64rm_EVEX, X86::MOVRS64rm }, + { X86::MOVRS8rm_EVEX, X86::MOVRS8rm }, { X86::MULX32rm_EVEX, X86::MULX32rm }, { X86::MULX32rr_EVEX, X86::MULX32rr }, { X86::MULX64rm_EVEX, X86::MULX64rm }, From c3ba6f378ef80d750e2278560c6f95a300114412 Mon Sep 17 00:00:00 2001 From: Viktoriia Bakalova <115406782+VitaNuo@users.noreply.github.com> Date: Fri, 17 Jan 2025 09:10:58 +0100 Subject: [PATCH 03/45] =?UTF-8?q?[Modules]=20Delay=20deserialization=20of?= =?UTF-8?q?=20preferred=5Fname=20attribute=20at=20r=E2=80=A6=20(#122726)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit …ecord level. 
This fixes the incorrect diagnostic emitted when compiling the following snippet ``` // string_view.h template class basic_string_view; typedef basic_string_view string_view; template class __attribute__((__preferred_name__(string_view))) basic_string_view { public: basic_string_view() { } }; inline basic_string_view foo() { return basic_string_view(); } // A.cppm module; #include "string_view.h" export module A; // Use.cppm module; #include "string_view.h" export module Use; import A; ``` The diagnostic is ``` string_view.h:11:5: error: 'basic_string_view::basic_string_view' from module 'A.' is not present in definition of 'string_view' provided earlier ``` The underlying issue is that deserialization of the `preferred_name` attribute triggers deserialization of `basic_string_view`, which triggers the deserialization of the `preferred_name` attribute again (since it's attached to the `basic_string_view` template). The deserialization logic is implemented in a way that prevents it from going on a loop in a literal sense (it detects early on that it has already seen the `string_view` typedef when trying to start its deserialization for the second time), but leaves the typedef deserialization in an unfinished state. Subsequently, the `string_view` typedef from the deserialized module cannot be merged with the same typedef from `string_view.h`, resulting in the above diagnostic. This PR resolves the problem by delaying the deserialization of the `preferred_name` attribute until the deserialization of the `basic_string_view` template is completed. As a result of deferring, the deserialization of the `preferred_name` attribute doesn't need to go on a loop since the type of the `string_view` typedef is already known when it's deserialized. --- clang/include/clang/AST/Attr.h | 14 +++- clang/include/clang/Basic/Attr.td | 11 +++ clang/include/clang/Serialization/ASTReader.h | 19 +++++ .../clang/Serialization/ASTRecordReader.h | 13 ++- clang/lib/Serialization/ASTReader.cpp | 5 ++ clang/lib/Serialization/ASTReaderDecl.cpp | 79 ++++++++++++++++++- clang/lib/Serialization/ASTWriter.cpp | 16 ++-- clang/test/Modules/preferred_name.cppm | 12 ++- clang/utils/TableGen/ClangAttrEmitter.cpp | 4 + 9 files changed, 156 insertions(+), 17 deletions(-) diff --git a/clang/include/clang/AST/Attr.h b/clang/include/clang/AST/Attr.h index 3365ebe4d9012..bed532a84a1bd 100644 --- a/clang/include/clang/AST/Attr.h +++ b/clang/include/clang/AST/Attr.h @@ -60,6 +60,8 @@ class Attr : public AttributeCommonInfo { unsigned IsLateParsed : 1; LLVM_PREFERRED_TYPE(bool) unsigned InheritEvenIfAlreadyPresent : 1; + LLVM_PREFERRED_TYPE(bool) + unsigned DeferDeserialization : 1; void *operator new(size_t bytes) noexcept { llvm_unreachable("Attrs cannot be allocated with regular 'new'."); @@ -80,10 +82,11 @@ class Attr : public AttributeCommonInfo { protected: Attr(ASTContext &Context, const AttributeCommonInfo &CommonInfo, - attr::Kind AK, bool IsLateParsed) + attr::Kind AK, bool IsLateParsed, bool DeferDeserialization = false) : AttributeCommonInfo(CommonInfo), AttrKind(AK), Inherited(false), IsPackExpansion(false), Implicit(false), IsLateParsed(IsLateParsed), - InheritEvenIfAlreadyPresent(false) {} + InheritEvenIfAlreadyPresent(false), + DeferDeserialization(DeferDeserialization) {} public: attr::Kind getKind() const { return static_cast(AttrKind); } @@ -105,6 +108,8 @@ class Attr : public AttributeCommonInfo { void setPackExpansion(bool PE) { IsPackExpansion = PE; } bool isPackExpansion() const { return IsPackExpansion; } + bool 
shouldDeferDeserialization() const { return DeferDeserialization; } + // Clone this attribute. Attr *clone(ASTContext &C) const; @@ -146,8 +151,9 @@ class InheritableAttr : public Attr { protected: InheritableAttr(ASTContext &Context, const AttributeCommonInfo &CommonInfo, attr::Kind AK, bool IsLateParsed, - bool InheritEvenIfAlreadyPresent) - : Attr(Context, CommonInfo, AK, IsLateParsed) { + bool InheritEvenIfAlreadyPresent, + bool DeferDeserialization = false) + : Attr(Context, CommonInfo, AK, IsLateParsed, DeferDeserialization) { this->InheritEvenIfAlreadyPresent = InheritEvenIfAlreadyPresent; } diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td index 408d3adf370c8..3969dd8af5dfa 100644 --- a/clang/include/clang/Basic/Attr.td +++ b/clang/include/clang/Basic/Attr.td @@ -713,6 +713,12 @@ class Attr { // attribute may be documented under multiple categories, more than one // Documentation entry may be listed. list Documentation; + // Set to true if deserialization of this attribute must be deferred until + // the parent Decl is fully deserialized (during header module file + // deserialization). E.g., this is the case for the preferred_name attribute, + // since its type deserialization depends on its target Decl type. + // (See https://github.com/llvm/llvm-project/issues/56490 for details). + bit DeferDeserialization = 0; } /// Used to define a set of mutually exclusive attributes. @@ -3254,6 +3260,11 @@ def PreferredName : InheritableAttr { let InheritEvenIfAlreadyPresent = 1; let MeaningfulToClassTemplateDefinition = 1; let TemplateDependent = 1; + // Type of this attribute depends on the target Decl type. + // Therefore, its deserialization must be deferred until + // deserialization of the target Decl is complete + // (for header modules). + let DeferDeserialization = 1; } def PreserveMost : DeclOrTypeAttr { diff --git a/clang/include/clang/Serialization/ASTReader.h b/clang/include/clang/Serialization/ASTReader.h index d77bb01c5aa59..c839215dc4077 100644 --- a/clang/include/clang/Serialization/ASTReader.h +++ b/clang/include/clang/Serialization/ASTReader.h @@ -1221,6 +1221,24 @@ class ASTReader /// been completed. std::deque PendingDeclContextInfos; + /// Deserialization of some attributes must be deferred since they refer + /// to themselves in their type (e.g., preferred_name attribute refers to the + /// typedef that refers back to the template specialization of the template + /// that the attribute is attached to). + /// More attributes that store TypeSourceInfo might be potentially affected, + /// see https://github.com/llvm/llvm-project/issues/56490 for details. + struct DeferredAttribute { + // Index of the deferred attribute in the Record of the TargetedDecl. + uint64_t RecordIdx; + // Decl to attach a deferred attribute to. + Decl *TargetedDecl; + }; + + /// The collection of Decls that have been loaded but some of their attributes + /// have been deferred, paired with the index inside the record pointing + /// at the skipped attribute. 
+ SmallVector PendingDeferredAttributes; + template using DuplicateObjCDecls = std::pair; @@ -1570,6 +1588,7 @@ class ASTReader void loadPendingDeclChain(Decl *D, uint64_t LocalOffset); void loadObjCCategories(GlobalDeclID ID, ObjCInterfaceDecl *D, unsigned PreviousGeneration = 0); + void loadDeferredAttribute(const DeferredAttribute &DA); RecordLocation getLocalBitOffset(uint64_t GlobalOffset); uint64_t getGlobalBitOffset(ModuleFile &M, uint64_t LocalOffset); diff --git a/clang/include/clang/Serialization/ASTRecordReader.h b/clang/include/clang/Serialization/ASTRecordReader.h index 2561418b78ca7..a29972fcf73a8 100644 --- a/clang/include/clang/Serialization/ASTRecordReader.h +++ b/clang/include/clang/Serialization/ASTRecordReader.h @@ -83,6 +83,12 @@ class ASTRecordReader /// Returns the current value in this record, without advancing. uint64_t peekInt() { return Record[Idx]; } + /// Returns the next N values in this record, without advancing. + uint64_t peekInts(unsigned N) { return Record[Idx + N]; } + + /// Skips the current value. + void skipInt() { Idx += 1; } + /// Skips the specified number of values. void skipInts(unsigned N) { Idx += N; } @@ -335,7 +341,12 @@ class ASTRecordReader Attr *readAttr(); /// Reads attributes from the current stream position, advancing Idx. - void readAttributes(AttrVec &Attrs); + /// For some attributes (where type depends on itself recursively), defer + /// reading the attribute until the type has been read. + void readAttributes(AttrVec &Attrs, Decl *D = nullptr); + + /// Reads one attribute from the current stream position, advancing Idx. + Attr *readOrDeferAttrFor(Decl *D); /// Read an BTFTypeTagAttr object. BTFTypeTagAttr *readBTFTypeTagAttr() { diff --git a/clang/lib/Serialization/ASTReader.cpp b/clang/lib/Serialization/ASTReader.cpp index 202227b195585..d08dc6b1b4d93 100644 --- a/clang/lib/Serialization/ASTReader.cpp +++ b/clang/lib/Serialization/ASTReader.cpp @@ -10180,6 +10180,11 @@ void ASTReader::finishPendingActions() { } PendingDeducedVarTypes.clear(); + // Load the delayed preferred name attributes. + for (unsigned I = 0; I != PendingDeferredAttributes.size(); ++I) + loadDeferredAttribute(PendingDeferredAttributes[I]); + PendingDeferredAttributes.clear(); + // For each decl chain that we wanted to complete while deserializing, mark // it as "still needs to be completed". for (unsigned I = 0; I != PendingIncompleteDeclChains.size(); ++I) { diff --git a/clang/lib/Serialization/ASTReaderDecl.cpp b/clang/lib/Serialization/ASTReaderDecl.cpp index 1c51a7b5e460f..06dff02ac6128 100644 --- a/clang/lib/Serialization/ASTReaderDecl.cpp +++ b/clang/lib/Serialization/ASTReaderDecl.cpp @@ -612,7 +612,7 @@ void ASTDeclReader::VisitDecl(Decl *D) { if (HasAttrs) { AttrVec Attrs; - Record.readAttributes(Attrs); + Record.readAttributes(Attrs, D); // Avoid calling setAttrs() directly because it uses Decl::getASTContext() // internally which is unsafe during derialization. 
D->setAttrsImpl(Attrs, Reader.getContext()); @@ -3093,6 +3093,8 @@ class AttrReader { return Reader.readInt(); } + uint64_t peekInts(unsigned N) { return Reader.peekInts(N); } + bool readBool() { return Reader.readBool(); } SourceRange readSourceRange() { @@ -3123,18 +3125,29 @@ class AttrReader { return Reader.readVersionTuple(); } + void skipInt() { Reader.skipInts(1); } + + void skipInts(unsigned N) { Reader.skipInts(N); } + + unsigned getCurrentIdx() { return Reader.getIdx(); } + OMPTraitInfo *readOMPTraitInfo() { return Reader.readOMPTraitInfo(); } template T *readDeclAs() { return Reader.readDeclAs(); } }; } +/// Reads one attribute from the current stream position, advancing Idx. Attr *ASTRecordReader::readAttr() { AttrReader Record(*this); auto V = Record.readInt(); if (!V) return nullptr; + // Read and ignore the skip count, since attribute deserialization is not + // deferred on this pass. + Record.skipInt(); + Attr *New = nullptr; // Kind is stored as a 1-based integer because 0 is used to indicate a null // Attr pointer. @@ -3164,13 +3177,28 @@ Attr *ASTRecordReader::readAttr() { return New; } -/// Reads attributes from the current stream position. -void ASTRecordReader::readAttributes(AttrVec &Attrs) { +/// Reads attributes from the current stream position, advancing Idx. +/// For some attributes (where type depends on itself recursively), defer +/// reading the attribute until the type has been read. +void ASTRecordReader::readAttributes(AttrVec &Attrs, Decl *D) { for (unsigned I = 0, E = readInt(); I != E; ++I) - if (auto *A = readAttr()) + if (auto *A = readOrDeferAttrFor(D)) Attrs.push_back(A); } +/// Reads one attribute from the current stream position, advancing Idx. +/// For some attributes (where type depends on itself recursively), defer +/// reading the attribute until the type has been read. 
+Attr *ASTRecordReader::readOrDeferAttrFor(Decl *D) { + AttrReader Record(*this); + unsigned SkipCount = Record.peekInts(1); + if (!SkipCount) + return readAttr(); + Reader->PendingDeferredAttributes.push_back({Record.getCurrentIdx(), D}); + Record.skipInts(SkipCount); + return nullptr; +} + //===----------------------------------------------------------------------===// // ASTReader Implementation //===----------------------------------------------------------------------===// @@ -4459,6 +4487,49 @@ void ASTReader::loadPendingDeclChain(Decl *FirstLocal, uint64_t LocalOffset) { ASTDeclReader::attachLatestDecl(CanonDecl, MostRecent); } +void ASTReader::loadDeferredAttribute(const DeferredAttribute &DA) { + Decl *D = DA.TargetedDecl; + ModuleFile *M = getOwningModuleFile(D); + + unsigned LocalDeclIndex = D->getGlobalID().getLocalDeclIndex(); + const DeclOffset &DOffs = M->DeclOffsets[LocalDeclIndex]; + RecordLocation Loc(M, DOffs.getBitOffset(M->DeclsBlockStartOffset)); + + llvm::BitstreamCursor &Cursor = Loc.F->DeclsCursor; + SavedStreamPosition SavedPosition(Cursor); + if (llvm::Error Err = Cursor.JumpToBit(Loc.Offset)) { + Error(std::move(Err)); + } + + Expected MaybeCode = Cursor.ReadCode(); + if (!MaybeCode) { + llvm::report_fatal_error( + Twine("ASTReader::loadPreferredNameAttribute failed reading code: ") + + toString(MaybeCode.takeError())); + } + unsigned Code = MaybeCode.get(); + + ASTRecordReader Record(*this, *Loc.F); + Expected MaybeRecCode = Record.readRecord(Cursor, Code); + if (!MaybeRecCode) { + llvm::report_fatal_error( + Twine( + "ASTReader::loadPreferredNameAttribute failed reading rec code: ") + + toString(MaybeCode.takeError())); + } + unsigned RecCode = MaybeRecCode.get(); + if (RecCode < DECL_TYPEDEF || RecCode > DECL_LAST) { + llvm::report_fatal_error( + Twine("ASTReader::loadPreferredNameAttribute failed reading rec code: " + "expected valid DeclCode") + + toString(MaybeCode.takeError())); + } + + Record.skipInts(DA.RecordIdx); + Attr *A = Record.readAttr(); + getContext().getDeclAttrs(D).push_back(A); +} + namespace { /// Given an ObjC interface, goes through the modules and links to the diff --git a/clang/lib/Serialization/ASTWriter.cpp b/clang/lib/Serialization/ASTWriter.cpp index 55d3c2bb56f2c..1c4f5730df312 100644 --- a/clang/lib/Serialization/ASTWriter.cpp +++ b/clang/lib/Serialization/ASTWriter.cpp @@ -37,6 +37,7 @@ #include "clang/AST/Type.h" #include "clang/AST/TypeLoc.h" #include "clang/AST/TypeLocVisitor.h" +#include "clang/Basic/AttrKinds.h" #include "clang/Basic/Diagnostic.h" #include "clang/Basic/DiagnosticOptions.h" #include "clang/Basic/FileEntry.h" @@ -5067,15 +5068,14 @@ void ASTWriter::WriteModuleFileExtension(Sema &SemaRef, void ASTRecordWriter::AddAttr(const Attr *A) { auto &Record = *this; - // FIXME: Clang can't handle the serialization/deserialization of - // preferred_name properly now. See - // https://github.com/llvm/llvm-project/issues/56490 for example. - if (!A || (isa(A) && - Writer->isWritingStdCXXNamedModules())) + if (!A) return Record.push_back(0); Record.push_back(A->getKind() + 1); // FIXME: stable encoding, target attrs + auto SkipIdx = Record.size(); + // Add placeholder for the size of deferred attribute. 
+ Record.push_back(0); Record.AddIdentifierRef(A->getAttrName()); Record.AddIdentifierRef(A->getScopeName()); Record.AddSourceRange(A->getRange()); @@ -5086,6 +5086,12 @@ void ASTRecordWriter::AddAttr(const Attr *A) { Record.push_back(A->isRegularKeywordAttribute()); #include "clang/Serialization/AttrPCHWrite.inc" + + if (A->shouldDeferDeserialization()) { + // Record the actual size of deferred attribute (+ 1 to count the attribute + // kind). + Record[SkipIdx] = Record.size() - SkipIdx + 1; + } } /// Emit the list of attributes to the specified record. diff --git a/clang/test/Modules/preferred_name.cppm b/clang/test/Modules/preferred_name.cppm index 806781a81c5ca..86ba6ae96db99 100644 --- a/clang/test/Modules/preferred_name.cppm +++ b/clang/test/Modules/preferred_name.cppm @@ -53,10 +53,16 @@ import A; export using ::foo_templ; //--- Use1.cpp -import A; // expected-warning@foo.h:8 {{attribute declaration must precede definition}} -#include "foo.h" // expected-note@foo.h:9 {{previous definition is here}} - +// expected-no-diagnostics +import A; +#include "foo.h" //--- Use2.cpp // expected-no-diagnostics #include "foo.h" import A; + +//--- Use3.cpp +#include "foo.h" +import A; +foo test; +int size = test.size(); // expected-error {{no member named 'size' in 'foo'}} diff --git a/clang/utils/TableGen/ClangAttrEmitter.cpp b/clang/utils/TableGen/ClangAttrEmitter.cpp index cc6a8eaebd44e..41730eba32ce2 100644 --- a/clang/utils/TableGen/ClangAttrEmitter.cpp +++ b/clang/utils/TableGen/ClangAttrEmitter.cpp @@ -3043,6 +3043,10 @@ static void emitAttributes(const RecordKeeper &Records, raw_ostream &OS, << (R.getValueAsBit("InheritEvenIfAlreadyPresent") ? "true" : "false"); } + if (R.getValueAsBit("DeferDeserialization")) { + OS << ", " + << "/*DeferDeserialization=*/true"; + } OS << ")\n"; for (auto const &ai : Args) { From 90a05f32166c4a45224a5eedbec9c5c7e21d2dbf Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Fri, 17 Jan 2025 09:26:49 +0100 Subject: [PATCH 04/45] [openmp] Support CET in z_Linux_asm.S (#123213) When libomp is built with -cf-protection, add endbr instructions to the start of functions for Intel CET support. --- openmp/runtime/src/z_Linux_asm.S | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/openmp/runtime/src/z_Linux_asm.S b/openmp/runtime/src/z_Linux_asm.S index cc5344cdd124a..0bf9f07a13f14 100644 --- a/openmp/runtime/src/z_Linux_asm.S +++ b/openmp/runtime/src/z_Linux_asm.S @@ -19,6 +19,16 @@ #if KMP_ARCH_X86 || KMP_ARCH_X86_64 +# if defined(__ELF__) && defined(__CET__) && defined(__has_include) +# if __has_include() +# include +# endif +# endif + +# if !defined(_CET_ENDBR) +# define _CET_ENDBR +# endif + # if KMP_MIC // the 'delay r16/r32/r64' should be used instead of the 'pause'. // The delay operation has the effect of removing the current thread from @@ -66,6 +76,7 @@ ALIGN 4 .globl KMP_PREFIX_UNDERSCORE($0) KMP_PREFIX_UNDERSCORE($0): + _CET_ENDBR .endmacro # else // KMP_OS_DARWIN # define KMP_PREFIX_UNDERSCORE(x) x //no extra underscore for Linux* OS symbols @@ -92,6 +103,7 @@ KMP_PREFIX_UNDERSCORE($0): .globl KMP_PREFIX_UNDERSCORE(\proc) KMP_PREFIX_UNDERSCORE(\proc): .cfi_startproc + _CET_ENDBR .endm .macro KMP_CFI_DEF_OFFSET sz .cfi_def_cfa_offset \sz From 3c42a774569ee06fb02ce00e2d2d2ce517c894f3 Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Fri, 17 Jan 2025 09:38:00 +0100 Subject: [PATCH 05/45] [BOLT] Fix handling of LLVM_LIBDIR_SUFFIX (#122874) This fixes a number of issues introduced in #97130 when LLVM_LIBDIR_SUFFIX is a non-empty string. 
Make sure that the libdir is always referenced as `lib${LLVM_LIBDIR_SUFFIX}`, not as just `lib` or `${CMAKE_INSTALL_LIBDIR}${LLVM_LIBDIR_SUFFIX}`. This is the standard libdir convention for all LLVM subprojects. Using `${CMAKE_INSTALL_LIBDIR}${LLVM_LIBDIR_SUFFIX}` would result in a duplicate suffix. --- bolt/CMakeLists.txt | 4 ++-- bolt/runtime/CMakeLists.txt | 18 +++++++++--------- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/bolt/CMakeLists.txt b/bolt/CMakeLists.txt index 9ac196ad0e821..04db160b64b05 100644 --- a/bolt/CMakeLists.txt +++ b/bolt/CMakeLists.txt @@ -163,8 +163,8 @@ if (BOLT_ENABLE_RUNTIME) add_llvm_install_targets(install-bolt_rt DEPENDS bolt_rt bolt COMPONENT bolt) - set(LIBBOLT_RT_INSTR "${CMAKE_CURRENT_BINARY_DIR}/bolt_rt-bins/lib/libbolt_rt_instr.a") - set(LIBBOLT_RT_HUGIFY "${CMAKE_CURRENT_BINARY_DIR}/bolt_rt-bins/lib/libbolt_rt_hugify.a") + set(LIBBOLT_RT_INSTR "${CMAKE_CURRENT_BINARY_DIR}/bolt_rt-bins/lib${LLVM_LIBDIR_SUFFIX}/libbolt_rt_instr.a") + set(LIBBOLT_RT_HUGIFY "${CMAKE_CURRENT_BINARY_DIR}/bolt_rt-bins/lib${LLVM_LIBDIR_SUFFIX}/libbolt_rt_hugify.a") endif() find_program(GNU_LD_EXECUTABLE NAMES ${LLVM_DEFAULT_TARGET_TRIPLE}-ld.bfd ld.bfd DOC "GNU ld") diff --git a/bolt/runtime/CMakeLists.txt b/bolt/runtime/CMakeLists.txt index 40f4fbc9f30d5..0deb69a27d435 100644 --- a/bolt/runtime/CMakeLists.txt +++ b/bolt/runtime/CMakeLists.txt @@ -16,18 +16,18 @@ add_library(bolt_rt_instr STATIC instr.cpp ${CMAKE_CURRENT_BINARY_DIR}/config.h ) -set_target_properties(bolt_rt_instr PROPERTIES ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_INSTALL_LIBDIR}${LLVM_LIBDIR_SUFFIX}") +set_target_properties(bolt_rt_instr PROPERTIES ARCHIVE_OUTPUT_DIRECTORY "lib${LLVM_LIBDIR_SUFFIX}") add_library(bolt_rt_hugify STATIC hugify.cpp ${CMAKE_CURRENT_BINARY_DIR}/config.h ) -set_target_properties(bolt_rt_hugify PROPERTIES ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_INSTALL_LIBDIR}${LLVM_LIBDIR_SUFFIX}") +set_target_properties(bolt_rt_hugify PROPERTIES ARCHIVE_OUTPUT_DIRECTORY "lib${LLVM_LIBDIR_SUFFIX}") if(NOT BOLT_BUILT_STANDALONE) add_custom_command(TARGET bolt_rt_instr POST_BUILD - COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_CURRENT_BINARY_DIR}/lib/libbolt_rt_instr.a" "${LLVM_LIBRARY_DIR}") + COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_CURRENT_BINARY_DIR}/lib${LLVM_LIBDIR_SUFFIX}/libbolt_rt_instr.a" "${LLVM_LIBRARY_DIR}") add_custom_command(TARGET bolt_rt_hugify POST_BUILD - COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_CURRENT_BINARY_DIR}/lib/libbolt_rt_hugify.a" "${LLVM_LIBRARY_DIR}") + COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_CURRENT_BINARY_DIR}/lib${LLVM_LIBDIR_SUFFIX}/libbolt_rt_hugify.a" "${LLVM_LIBRARY_DIR}") endif() set(BOLT_RT_FLAGS @@ -53,23 +53,23 @@ target_include_directories(bolt_rt_instr PRIVATE ${CMAKE_CURRENT_BINARY_DIR}) target_compile_options(bolt_rt_hugify PRIVATE ${BOLT_RT_FLAGS}) target_include_directories(bolt_rt_hugify PRIVATE ${CMAKE_CURRENT_BINARY_DIR}) -install(TARGETS bolt_rt_instr DESTINATION "${CMAKE_INSTALL_LIBDIR}${LLVM_LIBDIR_SUFFIX}") -install(TARGETS bolt_rt_hugify DESTINATION "${CMAKE_INSTALL_LIBDIR}${LLVM_LIBDIR_SUFFIX}") +install(TARGETS bolt_rt_instr DESTINATION "lib${LLVM_LIBDIR_SUFFIX}") +install(TARGETS bolt_rt_hugify DESTINATION "lib${LLVM_LIBDIR_SUFFIX}") if (CMAKE_CXX_COMPILER_ID MATCHES ".*Clang.*" AND CMAKE_SYSTEM_NAME STREQUAL "Darwin") add_library(bolt_rt_instr_osx STATIC instr.cpp ${CMAKE_CURRENT_BINARY_DIR}/config.h ) - set_target_properties(bolt_rt_instr_osx PROPERTIES ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_INSTALL_LIBDIR}${LLVM_LIBDIR_SUFFIX}") + 
set_target_properties(bolt_rt_instr_osx PROPERTIES ARCHIVE_OUTPUT_DIRECTORY "lib${LLVM_LIBDIR_SUFFIX}") target_include_directories(bolt_rt_instr_osx PRIVATE ${CMAKE_CURRENT_BINARY_DIR}) target_compile_options(bolt_rt_instr_osx PRIVATE -target x86_64-apple-darwin19.6.0 ${BOLT_RT_FLAGS}) - install(TARGETS bolt_rt_instr_osx DESTINATION "${CMAKE_INSTALL_LIBDIR}${LLVM_LIBDIR_SUFFIX}") + install(TARGETS bolt_rt_instr_osx DESTINATION "lib${LLVM_LIBDIR_SUFFIX}") if(NOT BOLT_BUILT_STANDALONE) add_custom_command(TARGET bolt_rt_instr_osx POST_BUILD - COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_CURRENT_BINARY_DIR}/lib/libbolt_rt_instr_osx.a" "${LLVM_LIBRARY_DIR}") + COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_CURRENT_BINARY_DIR}/lib${LLVM_LIBDIR_SUFFIX}/libbolt_rt_instr_osx.a" "${LLVM_LIBRARY_DIR}") endif() endif() From c8ba551da17c48e00c0eeb572e7667ffa5109f6f Mon Sep 17 00:00:00 2001 From: Will Froom Date: Fri, 17 Jan 2025 08:41:33 +0000 Subject: [PATCH 06/45] [AArch64] Return early rather than asserting when Size of value passed to targetShrinkDemandedConstant is not 32 or 64 (#123084) See https://github.com/llvm/llvm-project/issues/123029 for details. --- .../Target/AArch64/AArch64ISelLowering.cpp | 5 +- .../half-precision-signof-no-assert.ll | 48 +++++++++++++++++++ 2 files changed, 51 insertions(+), 2 deletions(-) create mode 100644 llvm/test/CodeGen/AArch64/half-precision-signof-no-assert.ll diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index d4a114c275fb7..7d3ca46204b67 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -2373,8 +2373,9 @@ bool AArch64TargetLowering::targetShrinkDemandedConstant( return false; unsigned Size = VT.getSizeInBits(); - assert((Size == 32 || Size == 64) && - "i32 or i64 is expected after legalization."); + + if (Size != 32 && Size != 64) + return false; // Exit early if we demand all bits. 
if (DemandedBits.popcount() == Size) diff --git a/llvm/test/CodeGen/AArch64/half-precision-signof-no-assert.ll b/llvm/test/CodeGen/AArch64/half-precision-signof-no-assert.ll new file mode 100644 index 0000000000000..92e15e78d8c41 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/half-precision-signof-no-assert.ll @@ -0,0 +1,48 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc --mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s + +; Check that the following does not crash +; See https://github.com/llvm/llvm-project/issues/123029 for details + +define ptr @fn(ptr %in, ptr %out) { +; CHECK-LABEL: fn: +; CHECK: // %bb.0: // %fn +; CHECK-NEXT: ldr d1, [x0] +; CHECK-NEXT: movi v0.4h, #60, lsl #8 +; CHECK-NEXT: adrp x8, .LCPI0_0 +; CHECK-NEXT: fcvtl v1.4s, v1.4h +; CHECK-NEXT: fcmgt v2.4s, v1.4s, #0.0 +; CHECK-NEXT: fcmlt v1.4s, v1.4s, #0.0 +; CHECK-NEXT: orr v1.16b, v1.16b, v2.16b +; CHECK-NEXT: ldr h2, [x8, :lo12:.LCPI0_0] +; CHECK-NEXT: xtn v1.4h, v1.4s +; CHECK-NEXT: and v0.8b, v1.8b, v0.8b +; CHECK-NEXT: movi d1, #0000000000000000 +; CHECK-NEXT: str d0, [x1] +; CHECK-NEXT: ldr h0, [x0, #8] +; CHECK-NEXT: mov x0, xzr +; CHECK-NEXT: fcvt s0, h0 +; CHECK-NEXT: fcmp s0, #0.0 +; CHECK-NEXT: fcsel s1, s2, s1, mi +; CHECK-NEXT: fcsel s1, s2, s1, gt +; CHECK-NEXT: mvni v2.4s, #128, lsl #24 +; CHECK-NEXT: fcvt s1, h1 +; CHECK-NEXT: bit v0.16b, v1.16b, v2.16b +; CHECK-NEXT: fcvt h0, s0 +; CHECK-NEXT: str h0, [x1, #8] +; CHECK-NEXT: ret +fn: + %1 = load <4 x half>, ptr %in + %2 = fcmp one <4 x half> %1, zeroinitializer + %3 = uitofp <4 x i1> %2 to <4 x half> + store <4 x half> %3, ptr %out + + %4 = getelementptr inbounds nuw i8, ptr %in, i64 8 + %5 = load half, ptr %4 + %6 = fcmp one half %5, 0xH0000 + %7 = uitofp i1 %6 to half + %8 = call half @llvm.copysign.f16(half %7, half %5) + %9 = getelementptr inbounds nuw i8, ptr %out, i64 8 + store half %8, ptr %9 + ret ptr null +} From 9720be95d63ce797437015d0f0edd10b02e80b7a Mon Sep 17 00:00:00 2001 From: Mel Chen Date: Fri, 17 Jan 2025 16:55:35 +0800 Subject: [PATCH 07/45] [LV][EVL] Disable fixed-order recurrence idiom with EVL tail folding. (#122458) The currently llvm.splice may occurs unexpected behavior if the evl of the second-to-last iteration is not VF*UF. Issue #122461 --- .../Transforms/Vectorize/LoopVectorize.cpp | 8 +- ...ce-tail-with-evl-fixed-order-recurrence.ll | 90 +++++++++++-------- 2 files changed, 58 insertions(+), 40 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 99f6a8860f0f4..8024cde41b5f9 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -1447,9 +1447,11 @@ class LoopVectorizationCostModel { // Override forced styles if needed. // FIXME: use actual opcode/data type for analysis here. // FIXME: Investigate opportunity for fixed vector factor. - bool EVLIsLegal = UserIC <= 1 && - TTI.hasActiveVectorLength(0, nullptr, Align()) && - !EnableVPlanNativePath; + bool EVLIsLegal = + UserIC <= 1 && TTI.hasActiveVectorLength(0, nullptr, Align()) && + !EnableVPlanNativePath && + // FIXME: remove this once fixed-ordered recurrence is supported. 
+ Legal->getFixedOrderRecurrences().empty(); if (!EVLIsLegal) { // If for some reason EVL mode is unsupported, fallback to // DataWithoutLaneMask to try to vectorize the loop with folded tail diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-fixed-order-recurrence.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-fixed-order-recurrence.ll index 9f8cf169c0593..809b69900731a 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-fixed-order-recurrence.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-fixed-order-recurrence.ll @@ -11,6 +11,10 @@ ; RUN: -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue \ ; RUN: -mtriple=riscv64 -mattr=+v,+f -S < %s| FileCheck %s --check-prefix=NO-VP +; FIXME: Fixed-order recurrence is not supported yet with EVL tail folding. +; The llvm.splice may occurs unexpected behavior if the evl of the +; second-to-last iteration is not VF*UF. + define void @first_order_recurrence(ptr noalias %A, ptr noalias %B, i64 %TC) { ; IF-EVL-LABEL: define void @first_order_recurrence( ; IF-EVL-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i64 [[TC:%.*]]) #[[ATTR0:[0-9]+]] { @@ -27,31 +31,35 @@ define void @first_order_recurrence(ptr noalias %A, ptr noalias %B, i64 %TC) { ; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 [[TC]], [[TMP6]] ; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]] ; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; IF-EVL-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[TC]], 1 ; IF-EVL-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() ; IF-EVL-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 4 ; IF-EVL-NEXT: [[TMP9:%.*]] = call i32 @llvm.vscale.i32() ; IF-EVL-NEXT: [[TMP10:%.*]] = mul i32 [[TMP9]], 4 ; IF-EVL-NEXT: [[TMP11:%.*]] = sub i32 [[TMP10]], 1 ; IF-EVL-NEXT: [[VECTOR_RECUR_INIT:%.*]] = insertelement poison, i32 33, i32 [[TMP11]] +; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0 +; IF-EVL-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer ; IF-EVL-NEXT: br label %[[VECTOR_BODY:.*]] ; IF-EVL: [[VECTOR_BODY]]: -; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; IF-EVL-NEXT: [[VECTOR_RECUR:%.*]] = phi [ [[VECTOR_RECUR_INIT]], %[[VECTOR_PH]] ], [ [[VP_OP_LOAD:%.*]], %[[VECTOR_BODY]] ] -; IF-EVL-NEXT: [[AVL:%.*]] = sub i64 [[TC]], [[EVL_BASED_IV]] -; IF-EVL-NEXT: [[TMP12:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true) ; IF-EVL-NEXT: [[TMP13:%.*]] = add i64 [[EVL_BASED_IV]], 0 +; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[EVL_BASED_IV]], i64 0 +; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; IF-EVL-NEXT: [[TMP25:%.*]] = call @llvm.stepvector.nxv4i64() +; IF-EVL-NEXT: [[TMP26:%.*]] = add zeroinitializer, [[TMP25]] +; IF-EVL-NEXT: [[VEC_IV:%.*]] = add [[BROADCAST_SPLAT]], [[TMP26]] +; IF-EVL-NEXT: [[TMP27:%.*]] = icmp ule [[VEC_IV]], [[BROADCAST_SPLAT2]] ; IF-EVL-NEXT: [[TMP14:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[TMP13]] ; IF-EVL-NEXT: [[TMP15:%.*]] = getelementptr inbounds nuw i32, ptr 
[[TMP14]], i32 0 -; IF-EVL-NEXT: [[VP_OP_LOAD]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP15]], splat (i1 true), i32 [[TMP12]]) +; IF-EVL-NEXT: [[VP_OP_LOAD]] = call @llvm.masked.load.nxv4i32.p0(ptr [[TMP15]], i32 4, [[TMP27]], poison) ; IF-EVL-NEXT: [[TMP16:%.*]] = call @llvm.vector.splice.nxv4i32( [[VECTOR_RECUR]], [[VP_OP_LOAD]], i32 -1) -; IF-EVL-NEXT: [[VP_OP:%.*]] = call @llvm.vp.add.nxv4i32( [[TMP16]], [[VP_OP_LOAD]], splat (i1 true), i32 [[TMP12]]) +; IF-EVL-NEXT: [[TMP19:%.*]] = add nsw [[TMP16]], [[VP_OP_LOAD]] ; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[TMP13]] ; IF-EVL-NEXT: [[TMP18:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP17]], i32 0 -; IF-EVL-NEXT: call void @llvm.vp.store.nxv4i32.p0( [[VP_OP]], ptr align 4 [[TMP18]], splat (i1 true), i32 [[TMP12]]) -; IF-EVL-NEXT: [[TMP19:%.*]] = zext i32 [[TMP12]] to i64 -; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP19]], [[EVL_BASED_IV]] -; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]] +; IF-EVL-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[TMP19]], ptr [[TMP18]], i32 4, [[TMP27]]) +; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[EVL_BASED_IV]], [[TMP8]] ; IF-EVL-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; IF-EVL-NEXT: br i1 [[TMP20]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; IF-EVL: [[MIDDLE_BLOCK]]: @@ -172,6 +180,7 @@ define void @second_order_recurrence(ptr noalias %A, ptr noalias %B, i64 %TC) { ; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 [[TC]], [[TMP6]] ; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]] ; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; IF-EVL-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[TC]], 1 ; IF-EVL-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() ; IF-EVL-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 4 ; IF-EVL-NEXT: [[TMP9:%.*]] = call i32 @llvm.vscale.i32() @@ -182,27 +191,30 @@ define void @second_order_recurrence(ptr noalias %A, ptr noalias %B, i64 %TC) { ; IF-EVL-NEXT: [[TMP13:%.*]] = mul i32 [[TMP12]], 4 ; IF-EVL-NEXT: [[TMP14:%.*]] = sub i32 [[TMP13]], 1 ; IF-EVL-NEXT: [[VECTOR_RECUR_INIT1:%.*]] = insertelement poison, i32 22, i32 [[TMP14]] +; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0 +; IF-EVL-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector [[BROADCAST_SPLATINSERT3]], poison, zeroinitializer ; IF-EVL-NEXT: br label %[[VECTOR_BODY:.*]] ; IF-EVL: [[VECTOR_BODY]]: -; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; IF-EVL-NEXT: [[VECTOR_RECUR:%.*]] = phi [ [[VECTOR_RECUR_INIT]], %[[VECTOR_PH]] ], [ [[VP_OP_LOAD:%.*]], %[[VECTOR_BODY]] ] ; IF-EVL-NEXT: [[VECTOR_RECUR2:%.*]] = phi [ [[VECTOR_RECUR_INIT1]], %[[VECTOR_PH]] ], [ [[TMP19:%.*]], %[[VECTOR_BODY]] ] -; IF-EVL-NEXT: [[AVL:%.*]] = sub i64 [[TC]], [[EVL_BASED_IV]] -; IF-EVL-NEXT: [[TMP15:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true) ; IF-EVL-NEXT: [[TMP16:%.*]] = add i64 [[EVL_BASED_IV]], 0 +; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[EVL_BASED_IV]], i64 0 +; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; IF-EVL-NEXT: [[TMP32:%.*]] = call 
@llvm.stepvector.nxv4i64() +; IF-EVL-NEXT: [[TMP33:%.*]] = add zeroinitializer, [[TMP32]] +; IF-EVL-NEXT: [[VEC_IV:%.*]] = add [[BROADCAST_SPLAT]], [[TMP33]] +; IF-EVL-NEXT: [[TMP34:%.*]] = icmp ule [[VEC_IV]], [[BROADCAST_SPLAT4]] ; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[TMP16]] ; IF-EVL-NEXT: [[TMP18:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP17]], i32 0 -; IF-EVL-NEXT: [[VP_OP_LOAD]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP18]], splat (i1 true), i32 [[TMP15]]) +; IF-EVL-NEXT: [[VP_OP_LOAD]] = call @llvm.masked.load.nxv4i32.p0(ptr [[TMP18]], i32 4, [[TMP34]], poison) ; IF-EVL-NEXT: [[TMP19]] = call @llvm.vector.splice.nxv4i32( [[VECTOR_RECUR]], [[VP_OP_LOAD]], i32 -1) ; IF-EVL-NEXT: [[TMP20:%.*]] = call @llvm.vector.splice.nxv4i32( [[VECTOR_RECUR2]], [[TMP19]], i32 -1) -; IF-EVL-NEXT: [[VP_OP:%.*]] = call @llvm.vp.add.nxv4i32( [[TMP19]], [[TMP20]], splat (i1 true), i32 [[TMP15]]) +; IF-EVL-NEXT: [[TMP23:%.*]] = add nsw [[TMP19]], [[TMP20]] ; IF-EVL-NEXT: [[TMP21:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[TMP16]] ; IF-EVL-NEXT: [[TMP22:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP21]], i32 0 -; IF-EVL-NEXT: call void @llvm.vp.store.nxv4i32.p0( [[VP_OP]], ptr align 4 [[TMP22]], splat (i1 true), i32 [[TMP15]]) -; IF-EVL-NEXT: [[TMP23:%.*]] = zext i32 [[TMP15]] to i64 -; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP23]], [[EVL_BASED_IV]] -; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]] +; IF-EVL-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[TMP23]], ptr [[TMP22]], i32 4, [[TMP34]]) +; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[EVL_BASED_IV]], [[TMP8]] ; IF-EVL-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; IF-EVL-NEXT: br i1 [[TMP24]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; IF-EVL: [[MIDDLE_BLOCK]]: @@ -218,12 +230,12 @@ define void @second_order_recurrence(ptr noalias %A, ptr noalias %B, i64 %TC) { ; IF-EVL: [[SCALAR_PH]]: ; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] ; IF-EVL-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ 33, %[[ENTRY]] ] -; IF-EVL-NEXT: [[SCALAR_RECUR_INIT4:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT3]], %[[MIDDLE_BLOCK]] ], [ 22, %[[ENTRY]] ] +; IF-EVL-NEXT: [[SCALAR_RECUR_INIT6:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT3]], %[[MIDDLE_BLOCK]] ], [ 22, %[[ENTRY]] ] ; IF-EVL-NEXT: br label %[[FOR_BODY:.*]] ; IF-EVL: [[FOR_BODY]]: ; IF-EVL-NEXT: [[INDVARS:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_NEXT:%.*]], %[[FOR_BODY]] ] ; IF-EVL-NEXT: [[FOR1:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], %[[SCALAR_PH]] ], [ [[TMP31:%.*]], %[[FOR_BODY]] ] -; IF-EVL-NEXT: [[FOR2:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT4]], %[[SCALAR_PH]] ], [ [[FOR1]], %[[FOR_BODY]] ] +; IF-EVL-NEXT: [[FOR2:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT6]], %[[SCALAR_PH]] ], [ [[FOR1]], %[[FOR_BODY]] ] ; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[INDVARS]] ; IF-EVL-NEXT: [[TMP31]] = load i32, ptr [[ARRAYIDX]], align 4 ; IF-EVL-NEXT: [[ADD:%.*]] = add nsw i32 [[FOR1]], [[FOR2]] @@ -342,6 +354,7 @@ define void @third_order_recurrence(ptr noalias %A, ptr noalias %B, i64 %TC) { ; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 [[TC]], [[TMP6]] ; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]] ; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; IF-EVL-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[TC]], 1 ; 
IF-EVL-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() ; IF-EVL-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 4 ; IF-EVL-NEXT: [[TMP9:%.*]] = call i32 @llvm.vscale.i32() @@ -356,30 +369,33 @@ define void @third_order_recurrence(ptr noalias %A, ptr noalias %B, i64 %TC) { ; IF-EVL-NEXT: [[TMP16:%.*]] = mul i32 [[TMP15]], 4 ; IF-EVL-NEXT: [[TMP17:%.*]] = sub i32 [[TMP16]], 1 ; IF-EVL-NEXT: [[VECTOR_RECUR_INIT3:%.*]] = insertelement poison, i32 11, i32 [[TMP17]] +; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT5:%.*]] = insertelement poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0 +; IF-EVL-NEXT: [[BROADCAST_SPLAT6:%.*]] = shufflevector [[BROADCAST_SPLATINSERT5]], poison, zeroinitializer ; IF-EVL-NEXT: br label %[[VECTOR_BODY:.*]] ; IF-EVL: [[VECTOR_BODY]]: -; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; IF-EVL-NEXT: [[VECTOR_RECUR:%.*]] = phi [ [[VECTOR_RECUR_INIT]], %[[VECTOR_PH]] ], [ [[VP_OP_LOAD:%.*]], %[[VECTOR_BODY]] ] ; IF-EVL-NEXT: [[VECTOR_RECUR2:%.*]] = phi [ [[VECTOR_RECUR_INIT1]], %[[VECTOR_PH]] ], [ [[TMP22:%.*]], %[[VECTOR_BODY]] ] ; IF-EVL-NEXT: [[VECTOR_RECUR4:%.*]] = phi [ [[VECTOR_RECUR_INIT3]], %[[VECTOR_PH]] ], [ [[TMP23:%.*]], %[[VECTOR_BODY]] ] -; IF-EVL-NEXT: [[AVL:%.*]] = sub i64 [[TC]], [[EVL_BASED_IV]] -; IF-EVL-NEXT: [[TMP18:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true) ; IF-EVL-NEXT: [[TMP19:%.*]] = add i64 [[EVL_BASED_IV]], 0 +; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[EVL_BASED_IV]], i64 0 +; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; IF-EVL-NEXT: [[TMP39:%.*]] = call @llvm.stepvector.nxv4i64() +; IF-EVL-NEXT: [[TMP40:%.*]] = add zeroinitializer, [[TMP39]] +; IF-EVL-NEXT: [[VEC_IV:%.*]] = add [[BROADCAST_SPLAT]], [[TMP40]] +; IF-EVL-NEXT: [[TMP41:%.*]] = icmp ule [[VEC_IV]], [[BROADCAST_SPLAT6]] ; IF-EVL-NEXT: [[TMP20:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[TMP19]] ; IF-EVL-NEXT: [[TMP21:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP20]], i32 0 -; IF-EVL-NEXT: [[VP_OP_LOAD]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP21]], splat (i1 true), i32 [[TMP18]]) +; IF-EVL-NEXT: [[VP_OP_LOAD]] = call @llvm.masked.load.nxv4i32.p0(ptr [[TMP21]], i32 4, [[TMP41]], poison) ; IF-EVL-NEXT: [[TMP22]] = call @llvm.vector.splice.nxv4i32( [[VECTOR_RECUR]], [[VP_OP_LOAD]], i32 -1) ; IF-EVL-NEXT: [[TMP23]] = call @llvm.vector.splice.nxv4i32( [[VECTOR_RECUR2]], [[TMP22]], i32 -1) ; IF-EVL-NEXT: [[TMP24:%.*]] = call @llvm.vector.splice.nxv4i32( [[VECTOR_RECUR4]], [[TMP23]], i32 -1) -; IF-EVL-NEXT: [[VP_OP:%.*]] = call @llvm.vp.add.nxv4i32( [[TMP23]], [[TMP24]], splat (i1 true), i32 [[TMP18]]) -; IF-EVL-NEXT: [[VP_OP5:%.*]] = call @llvm.vp.add.nxv4i32( [[VP_OP]], [[TMP22]], splat (i1 true), i32 [[TMP18]]) +; IF-EVL-NEXT: [[TMP27:%.*]] = add nsw [[TMP23]], [[TMP24]] +; IF-EVL-NEXT: [[TMP42:%.*]] = add [[TMP27]], [[TMP22]] ; IF-EVL-NEXT: [[TMP25:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[TMP19]] ; IF-EVL-NEXT: [[TMP26:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP25]], i32 0 -; IF-EVL-NEXT: call void @llvm.vp.store.nxv4i32.p0( [[VP_OP5]], ptr align 4 [[TMP26]], splat (i1 true), i32 [[TMP18]]) -; IF-EVL-NEXT: [[TMP27:%.*]] = zext i32 [[TMP18]] to i64 -; 
IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP27]], [[EVL_BASED_IV]] -; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]] +; IF-EVL-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[TMP42]], ptr [[TMP26]], i32 4, [[TMP41]]) +; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[EVL_BASED_IV]], [[TMP8]] ; IF-EVL-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; IF-EVL-NEXT: br i1 [[TMP28]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; IF-EVL: [[MIDDLE_BLOCK]]: @@ -399,14 +415,14 @@ define void @third_order_recurrence(ptr noalias %A, ptr noalias %B, i64 %TC) { ; IF-EVL: [[SCALAR_PH]]: ; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] ; IF-EVL-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ 33, %[[ENTRY]] ] -; IF-EVL-NEXT: [[SCALAR_RECUR_INIT8:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT6]], %[[MIDDLE_BLOCK]] ], [ 22, %[[ENTRY]] ] -; IF-EVL-NEXT: [[SCALAR_RECUR_INIT9:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT7]], %[[MIDDLE_BLOCK]] ], [ 11, %[[ENTRY]] ] +; IF-EVL-NEXT: [[SCALAR_RECUR_INIT9:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT6]], %[[MIDDLE_BLOCK]] ], [ 22, %[[ENTRY]] ] +; IF-EVL-NEXT: [[SCALAR_RECUR_INIT10:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT7]], %[[MIDDLE_BLOCK]] ], [ 11, %[[ENTRY]] ] ; IF-EVL-NEXT: br label %[[FOR_BODY:.*]] ; IF-EVL: [[FOR_BODY]]: ; IF-EVL-NEXT: [[INDVARS:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_NEXT:%.*]], %[[FOR_BODY]] ] ; IF-EVL-NEXT: [[FOR1:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], %[[SCALAR_PH]] ], [ [[TMP38:%.*]], %[[FOR_BODY]] ] -; IF-EVL-NEXT: [[FOR2:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT8]], %[[SCALAR_PH]] ], [ [[FOR1]], %[[FOR_BODY]] ] -; IF-EVL-NEXT: [[FOR3:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT9]], %[[SCALAR_PH]] ], [ [[FOR2]], %[[FOR_BODY]] ] +; IF-EVL-NEXT: [[FOR2:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT9]], %[[SCALAR_PH]] ], [ [[FOR1]], %[[FOR_BODY]] ] +; IF-EVL-NEXT: [[FOR3:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT10]], %[[SCALAR_PH]] ], [ [[FOR2]], %[[FOR_BODY]] ] ; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[INDVARS]] ; IF-EVL-NEXT: [[TMP38]] = load i32, ptr [[ARRAYIDX]], align 4 ; IF-EVL-NEXT: [[ADD:%.*]] = add nsw i32 [[FOR2]], [[FOR3]] From 0e13ce770bfbee7cfbc8086a038a950fe12c03d5 Mon Sep 17 00:00:00 2001 From: Yingwei Zheng Date: Fri, 17 Jan 2025 16:59:04 +0800 Subject: [PATCH 08/45] [InstCombine] Handle mul in `maintainNoSignedWrap` (#123299) Alive2: https://alive2.llvm.org/ce/z/Kgamks Closes https://github.com/llvm/llvm-project/issues/123175. For `@foo1`, the nsw flag is propagated because we first convert it into `mul nsw nuw (shl nsw nuw X, 1), 3`. --- .../InstCombine/InstructionCombining.cpp | 23 ++++--- llvm/test/Transforms/InstCombine/nsw.ll | 60 +++++++++++++++++++ 2 files changed, 74 insertions(+), 9 deletions(-) diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp index 2fb60ef11499c..fb21576722461 100644 --- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp +++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp @@ -281,28 +281,33 @@ bool InstCombinerImpl::shouldChangeType(Type *From, Type *To) const { // Return true, if No Signed Wrap should be maintained for I. // The No Signed Wrap flag can be kept if the operation "B (I.getOpcode) C", // where both B and C should be ConstantInts, results in a constant that does -// not overflow. 
This function only handles the Add and Sub opcodes. For +// not overflow. This function only handles the Add/Sub/Mul opcodes. For // all other opcodes, the function conservatively returns false. static bool maintainNoSignedWrap(BinaryOperator &I, Value *B, Value *C) { auto *OBO = dyn_cast(&I); if (!OBO || !OBO->hasNoSignedWrap()) return false; - // We reason about Add and Sub Only. - Instruction::BinaryOps Opcode = I.getOpcode(); - if (Opcode != Instruction::Add && Opcode != Instruction::Sub) - return false; - const APInt *BVal, *CVal; if (!match(B, m_APInt(BVal)) || !match(C, m_APInt(CVal))) return false; + // We reason about Add/Sub/Mul Only. bool Overflow = false; - if (Opcode == Instruction::Add) + switch (I.getOpcode()) { + case Instruction::Add: (void)BVal->sadd_ov(*CVal, Overflow); - else + break; + case Instruction::Sub: (void)BVal->ssub_ov(*CVal, Overflow); - + break; + case Instruction::Mul: + (void)BVal->smul_ov(*CVal, Overflow); + break; + default: + // Conservatively return false for other opcodes. + return false; + } return !Overflow; } diff --git a/llvm/test/Transforms/InstCombine/nsw.ll b/llvm/test/Transforms/InstCombine/nsw.ll index 329a47324f862..b00f2e58add78 100644 --- a/llvm/test/Transforms/InstCombine/nsw.ll +++ b/llvm/test/Transforms/InstCombine/nsw.ll @@ -415,3 +415,63 @@ define i8 @neg_nsw_mul_missing_nsw_on_mul(i8 %a1, i8 %a2, i8 %b) { %neg = sub nsw i8 0, %shl ret i8 %neg } + +; This could propagate nsw. + +define i16 @mul_nsw_reassoc_prop(i16 %x) { +; CHECK-LABEL: @mul_nsw_reassoc_prop( +; CHECK-NEXT: [[B:%.*]] = mul nsw i16 [[X:%.*]], 6 +; CHECK-NEXT: ret i16 [[B]] +; + %a = mul nsw i16 %x, 3 + %b = mul nsw i16 %a, 2 + ret i16 %b +} + +; This could propagate nsw. + +define i16 @mul_nsw_reassoc_prop_neg(i16 %x) { +; CHECK-LABEL: @mul_nsw_reassoc_prop_neg( +; CHECK-NEXT: [[B:%.*]] = mul nsw i16 [[X:%.*]], -2201 +; CHECK-NEXT: ret i16 [[B]] +; + %a = mul nsw i16 %x, -71 + %b = mul nsw i16 %a, 31 + ret i16 %b +} + +; Must not propagate nsw. + +define i16 @mul_nsw_reassoc_prop_no_nsw1(i16 %x) { +; CHECK-LABEL: @mul_nsw_reassoc_prop_no_nsw1( +; CHECK-NEXT: [[B:%.*]] = mul i16 [[X:%.*]], 6 +; CHECK-NEXT: ret i16 [[B]] +; + %a = mul i16 %x, 3 + %b = mul nsw i16 %a, 2 + ret i16 %b +} + +; Must not propagate nsw. + +define i16 @mul_nsw_reassoc_prop_no_nsw2(i16 %x) { +; CHECK-LABEL: @mul_nsw_reassoc_prop_no_nsw2( +; CHECK-NEXT: [[B:%.*]] = mul i16 [[X:%.*]], 6 +; CHECK-NEXT: ret i16 [[B]] +; + %a = mul nsw i16 %x, 3 + %b = mul i16 %a, 2 + ret i16 %b +} + +; Must not propagate nsw. + +define i16 @mul_nsw_reassoc_prop_overflow(i16 %x) { +; CHECK-LABEL: @mul_nsw_reassoc_prop_overflow( +; CHECK-NEXT: [[B:%.*]] = mul i16 [[X:%.*]], -31777 +; CHECK-NEXT: ret i16 [[B]] +; + %a = mul nsw i16 %x, 1023 + %b = mul nsw i16 %a, 33 + ret i16 %b +} From 320c2ee6c253f1bc0afe9c3d96cefb39195608f7 Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Fri, 17 Jan 2025 10:09:26 +0100 Subject: [PATCH 09/45] [BOLT] Pass -Wl,--build-id=none to linker in tests (#122886) This fixes the following tests: BOLT :: AArch64/check-init-not-moved.s BOLT :: X86/dwarf5-dwarf4-types-backward-forward-cross-reference.test BOLT :: X86/dwarf5-locexpr-referrence.test When clang is compiled with `-DENABLE_LINKER_BUILD_ID=ON`. 
--- bolt/test/lit.local.cfg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bolt/test/lit.local.cfg b/bolt/test/lit.local.cfg index e2fa0a4a2210f..d5a6849b27a77 100644 --- a/bolt/test/lit.local.cfg +++ b/bolt/test/lit.local.cfg @@ -1,5 +1,5 @@ host_linux_triple = config.target_triple.split("-")[0] + "-unknown-linux-gnu" -common_linker_flags = "-fuse-ld=lld -Wl,--unresolved-symbols=ignore-all -pie" +common_linker_flags = "-fuse-ld=lld -Wl,--unresolved-symbols=ignore-all -Wl,--build-id=none -pie" flags = f"--target={host_linux_triple} -fPIE {common_linker_flags}" config.substitutions.insert(0, ("%cflags", f"%cflags {flags}")) From 58903c9b71ccb167ed1be4be9d9eddf1b2f07845 Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Fri, 17 Jan 2025 10:21:54 +0100 Subject: [PATCH 10/45] [LLVM] Update AArch64 maintainers (#120440) This merges the maintainer lists for the ARM and AArch64 backends, as many people work on both to some degree. The list includes focus areas where possible. --- llvm/Maintainers.md | 36 ++++++++++++++++++++---------------- 1 file changed, 20 insertions(+), 16 deletions(-) diff --git a/llvm/Maintainers.md b/llvm/Maintainers.md index e2af991ed37b1..10714b508ca68 100644 --- a/llvm/Maintainers.md +++ b/llvm/Maintainers.md @@ -169,10 +169,26 @@ rnk@google.com (email), [rnk](https://github.com/rnk) (GitHub) ### Backends / Targets -#### AArch64 backend +#### ARM and AArch64 backends -Tim Northover \ -t.p.northover@gmail.com (email), [TNorthover](https://github.com/TNorthover) (GitHub) +David Green \ +david.green@arm.com (email), [davemgreen](https://github.com/davemgreen) (GitHub) \ +Amara Emerson (esp. AArch64 GlobalISel) \ +amara@apple.com (email), [aemerson](https://github.com/aemerson) (GitHub) \ +Eli Friedman (esp. ARM64EC) \ +efriedma@quicinc.com (email), [efriedma-quic](https://github.com/efriedma-quic) (GitHub) \ +Sjoerd Meijer \ +smeijer@nvidia.com (email), [sjoerdmeijer](https://github.com/sjoerdmeijer) (GitHub) \ +Nashe Mncube \ +nashe.mncube@arm.com (email), [nasherm](https://github.com/nasherm) (GitHub) \ +Sander de Smalen (esp. scalable vectorization/SVE/SME) \ +sander.desmalen@arm.com (email), [sdesmalen-arm](https://github.com/sdesmalen-arm) (GitHub) \ +Peter Smith (Anything ABI) \ +peter.smith@arm.com (email), [smithp35](https://github.com/smithp35) (GitHub) \ +Oliver Stannard (esp. 
assembly/dissassembly) \ +oliver.stannard@arm.com (email), [ostannard](https://github.com/ostannard) (GitHub) \ +Ties Stuij (Arm GlobalISel and early arch support) \ +ties.stuij@arm.com (email), [stuij](https://github.com/stuij) (GitHub) #### AMDGPU backend @@ -184,19 +200,6 @@ Matthew.Arsenault@amd.com, arsenm2@gmail.com (email), [arsenm](https://github.co Mark Schimmel \ marksl@synopsys.com (email), [markschimmel](https://github.com/markschimmel) (GitHub) -#### ARM backend - -David Green \ -david.green@arm.com (email), [davemgreen](https://github.com/davemgreen) (GitHub) \ -Oliver Stannard (Especially assembly/dissassembly) \ -oliver.stannard@arm.com (email), [ostannard](https://github.com/ostannard) (GitHub) \ -Nashe Mncube \ -nashe.mncube@arm.com (email), [nasherm](https://github.com/nasherm) (GitHub) \ -Peter Smith (Anything ABI) \ -peter.smith@arm.com (email), [smithp35](https://github.com/smithp35) (GitHub) \ -Ties Stuij (GlobalISel and early arch support) \ -ties.stuij@arm.com (email), [stuij](https://github.com/stuij) (GitHub) - #### AVR backend Ben Shi \ @@ -480,6 +483,7 @@ James Grosbach (grosbach@apple.com) -- MC layer \ Anton Korobeynikov (anton@korobeynikov.info, [asl](https://github.com/asl)) -- ARM EABI, Windows codegen \ Benjamin Kramer (benny.kra@gmail.com, [d0k](https://github.com/d0k)) -- DWARF Parser \ David Majnemer (david.majnemer@gmail.com, [majnemer](https://github.com/majnemer)) -- InstCombine, ConstantFold \ +Tim Northover (t.p.northover@gmail.com, [TNorthover](https://github.com/TNorthover)) -- AArch64 backend \ Chad Rosier (mcrosier@codeaurora.org) -- FastISel \ Hans Wennborg (hans@chromium.org, [zmodem](https://github.com/zmodem)) -- Release management \ Kostya Serebryany ([kcc](https://github.com/kcc)) -- Sanitizers \ From 73478708839fad8b02b3cfc84959d64a15ba93ca Mon Sep 17 00:00:00 2001 From: Karl-Johan Karlsson Date: Fri, 17 Jan 2025 10:23:27 +0100 Subject: [PATCH 11/45] [diagtool] Make the BuiltinDiagnosticsByID table sorted (#120321) When building with -DLLVM_ENABLE_EXPENSIVE_CHECKS=ON with a recent libstdc++ (e.g. from gcc 13.3.0) the testcase clang/test/Misc/warning-flags-tree.c fail with the message: ``` + diagtool tree --internal .../include/c++/13.3.0/bits/stl_algo.h:2013: In function: _ForwardIterator std::lower_bound(_ForwardIterator, _ForwardIterator, const _Tp &, _Compare) [_ForwardIterator = const diagtool::DiagnosticRecord *, _Tp = diagtool::DiagnosticRecord, _Compare = bool (*)(const diagtool::DiagnosticRecord &, const diagtool::DiagnosticRecord &)] Error: elements in iterator range [first, last) are not partitioned by the predicate __comp and value __val. Objects involved in the operation: iterator "first" @ 0x7ffea8ef2fd8 { } iterator "last" @ 0x7ffea8ef2fd0 { } ``` The reason for this error is that std::lower_bound is called on BuiltinDiagnosticsByID without it being entirely sorted. Calling std::lower_bound If the range is not sorted, the behavior of this function is undefined. This is detected when building with expensive checks. To make BuiltinDiagnosticsByID sorted we need to slightly change the order the inc-files are included. The include of DiagnosticCrossTUKinds.inc in DiagnosticNames.cpp is included too early and should be moved down directly after DiagnosticCommentKinds.inc. As a part of pull request the includes that build up BuiltinDiagnosticsByID table are extracted into a common wrapper header file AllDiagnosticKinds.inc that is used by both clang and diagtool. 
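For illustration only (standalone C++, not diagtool code; the record type, comparator and sample IDs below are made up for the example): std::lower_bound is only specified for ranges that are partitioned with respect to the comparator, which a table sorted by DiagID guarantees, so asserting sortedness up front turns a silent wrong lookup into a hard failure. That is the same idea as the llvm::is_sorted assertion added in this change.

```
// Minimal sketch of the sorted-table lookup pattern; not part of the patch.
#include <algorithm>
#include <cassert>
#include <cstdio>
#include <iterator>

struct Record { int ID; const char *Name; };

static bool orderByID(const Record &L, const Record &R) { return L.ID < R.ID; }

int main() {
  // The table must stay sorted by ID, otherwise lower_bound is undefined.
  static const Record Table[] = {{1, "first"}, {4, "second"}, {9, "third"}};
  assert(std::is_sorted(std::begin(Table), std::end(Table), orderByID) &&
         "table must be sorted by ID");

  const Record Key{4, nullptr};
  const Record *It =
      std::lower_bound(std::begin(Table), std::end(Table), Key, orderByID);
  if (It != std::end(Table) && It->ID == Key.ID)
    std::printf("found %s\n", It->Name);
  return 0;
}
```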
--- .../clang/Basic/AllDiagnosticKinds.inc | 33 +++++++++++++ clang/lib/Basic/DiagnosticIDs.cpp | 48 ++----------------- clang/tools/diagtool/DiagnosticNames.cpp | 22 ++++----- 3 files changed, 44 insertions(+), 59 deletions(-) create mode 100644 clang/include/clang/Basic/AllDiagnosticKinds.inc diff --git a/clang/include/clang/Basic/AllDiagnosticKinds.inc b/clang/include/clang/Basic/AllDiagnosticKinds.inc new file mode 100644 index 0000000000000..a946b4a640ac6 --- /dev/null +++ b/clang/include/clang/Basic/AllDiagnosticKinds.inc @@ -0,0 +1,33 @@ +//===--- AllDiagnosticKinds.inc----------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// Defines the Diagnostic IDs in ID sorted order. The order is dictated by +/// the enum in DiagnosticIDs.h#L49-L65. +/// +//===----------------------------------------------------------------------===// + +// Turn off clang-format, as the order of the includes are important to make +// sure tables based on Diagnostic IDs are partitioned/sorted based on +// DiagID. + +// clang-format off +#include "clang/Basic/DiagnosticCommonKinds.inc" +#include "clang/Basic/DiagnosticDriverKinds.inc" +#include "clang/Basic/DiagnosticFrontendKinds.inc" +#include "clang/Basic/DiagnosticSerializationKinds.inc" +#include "clang/Basic/DiagnosticLexKinds.inc" +#include "clang/Basic/DiagnosticParseKinds.inc" +#include "clang/Basic/DiagnosticASTKinds.inc" +#include "clang/Basic/DiagnosticCommentKinds.inc" +#include "clang/Basic/DiagnosticCrossTUKinds.inc" +#include "clang/Basic/DiagnosticSemaKinds.inc" +#include "clang/Basic/DiagnosticAnalysisKinds.inc" +#include "clang/Basic/DiagnosticRefactoringKinds.inc" +#include "clang/Basic/DiagnosticInstallAPIKinds.inc" +// clang-format on diff --git a/clang/lib/Basic/DiagnosticIDs.cpp b/clang/lib/Basic/DiagnosticIDs.cpp index d77f28c80b2eb..81194bbf2538e 100644 --- a/clang/lib/Basic/DiagnosticIDs.cpp +++ b/clang/lib/Basic/DiagnosticIDs.cpp @@ -37,21 +37,7 @@ struct StaticDiagInfoDescriptionStringTable { #define DIAG(ENUM, CLASS, DEFAULT_SEVERITY, DESC, GROUP, SFINAE, NOWERROR, \ SHOWINSYSHEADER, SHOWINSYSMACRO, DEFERRABLE, CATEGORY) \ char ENUM##_desc[sizeof(DESC)]; - // clang-format off -#include "clang/Basic/DiagnosticCommonKinds.inc" -#include "clang/Basic/DiagnosticDriverKinds.inc" -#include "clang/Basic/DiagnosticFrontendKinds.inc" -#include "clang/Basic/DiagnosticSerializationKinds.inc" -#include "clang/Basic/DiagnosticLexKinds.inc" -#include "clang/Basic/DiagnosticParseKinds.inc" -#include "clang/Basic/DiagnosticASTKinds.inc" -#include "clang/Basic/DiagnosticCommentKinds.inc" -#include "clang/Basic/DiagnosticCrossTUKinds.inc" -#include "clang/Basic/DiagnosticSemaKinds.inc" -#include "clang/Basic/DiagnosticAnalysisKinds.inc" -#include "clang/Basic/DiagnosticRefactoringKinds.inc" -#include "clang/Basic/DiagnosticInstallAPIKinds.inc" - // clang-format on +#include "clang/Basic/AllDiagnosticKinds.inc" #undef DIAG }; @@ -59,21 +45,7 @@ const StaticDiagInfoDescriptionStringTable StaticDiagInfoDescriptions = { #define DIAG(ENUM, CLASS, DEFAULT_SEVERITY, DESC, GROUP, SFINAE, NOWERROR, \ SHOWINSYSHEADER, SHOWINSYSMACRO, DEFERRABLE, CATEGORY) \ DESC, -// clang-format off -#include "clang/Basic/DiagnosticCommonKinds.inc" -#include 
"clang/Basic/DiagnosticDriverKinds.inc" -#include "clang/Basic/DiagnosticFrontendKinds.inc" -#include "clang/Basic/DiagnosticSerializationKinds.inc" -#include "clang/Basic/DiagnosticLexKinds.inc" -#include "clang/Basic/DiagnosticParseKinds.inc" -#include "clang/Basic/DiagnosticASTKinds.inc" -#include "clang/Basic/DiagnosticCommentKinds.inc" -#include "clang/Basic/DiagnosticCrossTUKinds.inc" -#include "clang/Basic/DiagnosticSemaKinds.inc" -#include "clang/Basic/DiagnosticAnalysisKinds.inc" -#include "clang/Basic/DiagnosticRefactoringKinds.inc" -#include "clang/Basic/DiagnosticInstallAPIKinds.inc" -// clang-format on +#include "clang/Basic/AllDiagnosticKinds.inc" #undef DIAG }; @@ -85,21 +57,7 @@ const uint32_t StaticDiagInfoDescriptionOffsets[] = { #define DIAG(ENUM, CLASS, DEFAULT_SEVERITY, DESC, GROUP, SFINAE, NOWERROR, \ SHOWINSYSHEADER, SHOWINSYSMACRO, DEFERRABLE, CATEGORY) \ offsetof(StaticDiagInfoDescriptionStringTable, ENUM##_desc), -// clang-format off -#include "clang/Basic/DiagnosticCommonKinds.inc" -#include "clang/Basic/DiagnosticDriverKinds.inc" -#include "clang/Basic/DiagnosticFrontendKinds.inc" -#include "clang/Basic/DiagnosticSerializationKinds.inc" -#include "clang/Basic/DiagnosticLexKinds.inc" -#include "clang/Basic/DiagnosticParseKinds.inc" -#include "clang/Basic/DiagnosticASTKinds.inc" -#include "clang/Basic/DiagnosticCommentKinds.inc" -#include "clang/Basic/DiagnosticCrossTUKinds.inc" -#include "clang/Basic/DiagnosticSemaKinds.inc" -#include "clang/Basic/DiagnosticAnalysisKinds.inc" -#include "clang/Basic/DiagnosticRefactoringKinds.inc" -#include "clang/Basic/DiagnosticInstallAPIKinds.inc" -// clang-format on +#include "clang/Basic/AllDiagnosticKinds.inc" #undef DIAG }; diff --git a/clang/tools/diagtool/DiagnosticNames.cpp b/clang/tools/diagtool/DiagnosticNames.cpp index eb90f082437b3..c3a3002889c73 100644 --- a/clang/tools/diagtool/DiagnosticNames.cpp +++ b/clang/tools/diagtool/DiagnosticNames.cpp @@ -23,26 +23,13 @@ llvm::ArrayRef diagtool::getBuiltinDiagnosticsByName() { return llvm::ArrayRef(BuiltinDiagnosticsByName); } - // FIXME: Is it worth having two tables, especially when this one can get // out of sync easily? 
static const DiagnosticRecord BuiltinDiagnosticsByID[] = { #define DIAG(ENUM, CLASS, DEFAULT_MAPPING, DESC, GROUP, SFINAE, NOWERROR, \ SHOWINSYSHEADER, SHOWINSYSMACRO, DEFER, CATEGORY) \ {#ENUM, diag::ENUM, STR_SIZE(#ENUM, uint8_t)}, -#include "clang/Basic/DiagnosticCommonKinds.inc" -#include "clang/Basic/DiagnosticCrossTUKinds.inc" -#include "clang/Basic/DiagnosticDriverKinds.inc" -#include "clang/Basic/DiagnosticFrontendKinds.inc" -#include "clang/Basic/DiagnosticSerializationKinds.inc" -#include "clang/Basic/DiagnosticLexKinds.inc" -#include "clang/Basic/DiagnosticParseKinds.inc" -#include "clang/Basic/DiagnosticASTKinds.inc" -#include "clang/Basic/DiagnosticCommentKinds.inc" -#include "clang/Basic/DiagnosticSemaKinds.inc" -#include "clang/Basic/DiagnosticAnalysisKinds.inc" -#include "clang/Basic/DiagnosticRefactoringKinds.inc" -#include "clang/Basic/DiagnosticInstallAPIKinds.inc" +#include "clang/Basic/AllDiagnosticKinds.inc" #undef DIAG }; @@ -54,6 +41,13 @@ static bool orderByID(const DiagnosticRecord &Left, const DiagnosticRecord &diagtool::getDiagnosticForID(short DiagID) { DiagnosticRecord Key = {nullptr, DiagID, 0}; + // The requirement for lower_bound to produce a valid result it is + // enough if the BuiltinDiagnosticsByID is partitioned (by DiagID), + // but as we want this function to work for all possible values of + // DiagID sent in as argument it is better to right away check if + // BuiltinDiagnosticsByID is sorted. + assert(llvm::is_sorted(BuiltinDiagnosticsByID, orderByID) && + "IDs in BuiltinDiagnosticsByID must be sorted."); const DiagnosticRecord *Result = llvm::lower_bound(BuiltinDiagnosticsByID, Key, orderByID); assert(Result && "diagnostic not found; table may be out of date"); From 89e3a649f207021c0884ed5f8e56321c51854ac3 Mon Sep 17 00:00:00 2001 From: ZhaoQi Date: Fri, 17 Jan 2025 17:29:22 +0800 Subject: [PATCH 12/45] [LoongArch] Emit R_LARCH_RELAX when expanding some macros (#120067) Emit `R_LARCH_RELAX` relocations when expanding some macros, including: - `la.tls.ie`, `la.tls.ld`, `la.tls.gd`, `la.tls.desc`, - `call36`, `tail36`. Other macros that need to emit `R_LARCH_RELAX` relocations was implemented in https://github.com/llvm/llvm-project/pull/72961, including: - `la.local`, `la.pcrel`, `la.pcrel` expanded as `la.abs`, `la`, `la.global`, `la/la.global` expanded as `la.pcrel`, `la.got`. Note: `la.tls.le` macro can be relaxed when expanded with `R_LARCH_TLS_LE_{HI20/ADD/LO12}_R` relocations. But if we do so, previously handwritten assembly code will occur error due to the redundant `add.{w/d}` followed by `la.tls.le`. So `la.tls.le` keeps to expands with `R_LARCH_TLS_LE_{HI20/LO12}`. 
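To illustrate what a relax hint ultimately buys us (a sketch only, not code added by this patch; `emitWithOptionalRelax` is an invented helper name and the LoongArch-specific enums come from the backend's MCTargetDesc headers): the primary relocation for the operand is emitted as before, and when the operand is a relax candidate and the `+relax` feature is enabled, an extra `R_LARCH_RELAX` fixup against a dummy zero expression is emitted next to it.

```
// Sketch of the fixup pairing in the MC code emitter; illustrative only.
#include "llvm/ADT/SmallVector.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCFixup.h"
#include "llvm/MC/MCSubtargetInfo.h"
using namespace llvm;

static void emitWithOptionalRelax(MCContext &Ctx, const MCSubtargetInfo &STI,
                                  SmallVectorImpl<MCFixup> &Fixups,
                                  const MCExpr *Expr, unsigned FixupKind,
                                  SMLoc Loc, bool RelaxCandidate) {
  // The primary relocation for the operand, e.g. R_LARCH_TLS_IE_PC_HI20.
  Fixups.push_back(MCFixup::create(0, Expr, MCFixupKind(FixupKind), Loc));
  // When the operand is a relax candidate and +relax is enabled, pair it
  // with an R_LARCH_RELAX fixup against a dummy zero expression so the
  // linker knows the preceding relocation may be relaxed.
  if (RelaxCandidate && STI.hasFeature(LoongArch::FeatureRelax)) {
    const MCConstantExpr *Dummy = MCConstantExpr::create(0, Ctx);
    Fixups.push_back(MCFixup::create(
        0, Dummy, MCFixupKind(LoongArch::fixup_loongarch_relax), Loc));
  }
}
```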
--- .../AsmParser/LoongArchAsmParser.cpp | 23 +++++--- llvm/test/MC/LoongArch/Macros/aliases-la.s | 52 +++++++++++++++++++ llvm/test/MC/LoongArch/Macros/macros-call.s | 17 ++++++ llvm/test/MC/LoongArch/Macros/macros-la.s | 20 +++++++ 4 files changed, 104 insertions(+), 8 deletions(-) diff --git a/llvm/lib/Target/LoongArch/AsmParser/LoongArchAsmParser.cpp b/llvm/lib/Target/LoongArch/AsmParser/LoongArchAsmParser.cpp index efc8b77f8d8fa..420b98b8a9c1f 100644 --- a/llvm/lib/Target/LoongArch/AsmParser/LoongArchAsmParser.cpp +++ b/llvm/lib/Target/LoongArch/AsmParser/LoongArchAsmParser.cpp @@ -1009,7 +1009,8 @@ void LoongArchAsmParser::emitLoadAddressPcrel(MCInst &Inst, SMLoc IDLoc, Insts.push_back( LoongArchAsmParser::Inst(ADDI, LoongArchMCExpr::VK_LoongArch_PCALA_LO12)); - emitLAInstSeq(DestReg, DestReg, Symbol, Insts, IDLoc, Out, true); + emitLAInstSeq(DestReg, DestReg, Symbol, Insts, IDLoc, Out, + /*RelaxHint=*/true); } void LoongArchAsmParser::emitLoadAddressPcrelLarge(MCInst &Inst, SMLoc IDLoc, @@ -1083,7 +1084,8 @@ void LoongArchAsmParser::emitLoadAddressGot(MCInst &Inst, SMLoc IDLoc, Insts.push_back( LoongArchAsmParser::Inst(LD, LoongArchMCExpr::VK_LoongArch_GOT_PC_LO12)); - emitLAInstSeq(DestReg, DestReg, Symbol, Insts, IDLoc, Out, true); + emitLAInstSeq(DestReg, DestReg, Symbol, Insts, IDLoc, Out, + /*RelaxHint=*/true); } void LoongArchAsmParser::emitLoadAddressGotLarge(MCInst &Inst, SMLoc IDLoc, @@ -1176,7 +1178,8 @@ void LoongArchAsmParser::emitLoadAddressTLSIE(MCInst &Inst, SMLoc IDLoc, Insts.push_back(LoongArchAsmParser::Inst( LD, LoongArchMCExpr::VK_LoongArch_TLS_IE_PC_LO12)); - emitLAInstSeq(DestReg, DestReg, Symbol, Insts, IDLoc, Out); + emitLAInstSeq(DestReg, DestReg, Symbol, Insts, IDLoc, Out, + /*RelaxHint=*/true); } void LoongArchAsmParser::emitLoadAddressTLSIELarge(MCInst &Inst, SMLoc IDLoc, @@ -1248,7 +1251,8 @@ void LoongArchAsmParser::emitLoadAddressTLSLD(MCInst &Inst, SMLoc IDLoc, Insts.push_back(LoongArchAsmParser::Inst( ADDI, LoongArchMCExpr::VK_LoongArch_GOT_PC_LO12)); - emitLAInstSeq(DestReg, DestReg, Symbol, Insts, IDLoc, Out); + emitLAInstSeq(DestReg, DestReg, Symbol, Insts, IDLoc, Out, + /*RelaxHint=*/true); } void LoongArchAsmParser::emitLoadAddressTLSLDLarge(MCInst &Inst, SMLoc IDLoc, @@ -1320,7 +1324,8 @@ void LoongArchAsmParser::emitLoadAddressTLSGD(MCInst &Inst, SMLoc IDLoc, Insts.push_back(LoongArchAsmParser::Inst( ADDI, LoongArchMCExpr::VK_LoongArch_GOT_PC_LO12)); - emitLAInstSeq(DestReg, DestReg, Symbol, Insts, IDLoc, Out); + emitLAInstSeq(DestReg, DestReg, Symbol, Insts, IDLoc, Out, + /*RelaxHint=*/true); } void LoongArchAsmParser::emitLoadAddressTLSGDLarge(MCInst &Inst, SMLoc IDLoc, @@ -1409,7 +1414,8 @@ void LoongArchAsmParser::emitLoadAddressTLSDesc(MCInst &Inst, SMLoc IDLoc, Insts.push_back(LoongArchAsmParser::Inst( LoongArch::JIRL, LoongArchMCExpr::VK_LoongArch_TLS_DESC_CALL)); - emitLAInstSeq(DestReg, DestReg, Symbol, Insts, IDLoc, Out); + emitLAInstSeq(DestReg, DestReg, Symbol, Insts, IDLoc, Out, + /*RelaxHint=*/true); } void LoongArchAsmParser::emitLoadAddressTLSDescLarge(MCInst &Inst, SMLoc IDLoc, @@ -1500,8 +1506,9 @@ void LoongArchAsmParser::emitFuncCall36(MCInst &Inst, SMLoc IDLoc, IsTailCall ? Inst.getOperand(0).getReg() : MCRegister(LoongArch::R1); const MCExpr *Sym = IsTailCall ? 
Inst.getOperand(1).getExpr() : Inst.getOperand(0).getExpr(); - const LoongArchMCExpr *LE = LoongArchMCExpr::create( - Sym, llvm::LoongArchMCExpr::VK_LoongArch_CALL36, getContext()); + const LoongArchMCExpr *LE = + LoongArchMCExpr::create(Sym, llvm::LoongArchMCExpr::VK_LoongArch_CALL36, + getContext(), /*RelaxHint=*/true); Out.emitInstruction( MCInstBuilder(LoongArch::PCADDU18I).addReg(ScratchReg).addExpr(LE), diff --git a/llvm/test/MC/LoongArch/Macros/aliases-la.s b/llvm/test/MC/LoongArch/Macros/aliases-la.s index dd5a4d474e001..1b5b818f4348f 100644 --- a/llvm/test/MC/LoongArch/Macros/aliases-la.s +++ b/llvm/test/MC/LoongArch/Macros/aliases-la.s @@ -3,13 +3,26 @@ # RUN: llvm-mc --triple=loongarch64 %s \ # RUN: | FileCheck %s --check-prefix=NORMAL +# RUN: llvm-mc --filetype=obj --triple=loongarch64 --mattr=-relax %s -o %t +# RUN: llvm-readobj -r %t | FileCheck %s --check-prefix=RELOC +# RUN: llvm-mc --filetype=obj --triple=loongarch64 --mattr=+relax %s -o %t.relax +# RUN: llvm-readobj -r %t.relax | FileCheck %s --check-prefixes=RELOC,RELAX # RUN: llvm-mc --triple=loongarch64 --mattr=+la-global-with-pcrel < %s \ # RUN: | FileCheck %s --check-prefix=GTOPCR +# RUN: llvm-mc --filetype=obj --triple=loongarch64 --mattr=+la-global-with-pcrel \ +# RUN: --mattr=-relax %s -o %t +# RUN: llvm-readobj -r %t | FileCheck %s --check-prefix=GTOPCR-RELOC +# RUN: llvm-mc --filetype=obj --triple=loongarch64 --mattr=+la-global-with-pcrel \ +# RUN: --mattr=+relax %s -o %t.relax +# RUN: llvm-readobj -r %t.relax | FileCheck %s --check-prefixes=GTOPCR-RELOC,GTOPCR-RELAX # RUN: llvm-mc --triple=loongarch64 --mattr=+la-global-with-abs < %s \ # RUN: | FileCheck %s --check-prefix=GTOABS # RUN: llvm-mc --triple=loongarch64 --mattr=+la-local-with-abs < %s \ # RUN: | FileCheck %s --check-prefix=LTOABS +# RELOC: Relocations [ +# RELOC-NEXT: Section ({{.*}}) .rela.text { + la $a0, sym # NORMAL: pcalau12i $a0, %got_pc_hi20(sym) # NORMAL-NEXT: ld.d $a0, $a0, %got_pc_lo12(sym) @@ -22,6 +35,16 @@ la $a0, sym # GTOABS-NEXT: lu32i.d $a0, %abs64_lo20(sym) # GTOABS-NEXT: lu52i.d $a0, $a0, %abs64_hi12(sym) +# RELOC-NEXT: R_LARCH_GOT_PC_HI20 sym 0x0 +# RELAX-NEXT: R_LARCH_RELAX - 0x0 +# RELOC-NEXT: R_LARCH_GOT_PC_LO12 sym 0x0 +# RELAX-NEXT: R_LARCH_RELAX - 0x0 + +# GTOPCR-RELOC: R_LARCH_PCALA_HI20 sym 0x0 +# GTOPCR-RELAX: R_LARCH_RELAX - 0x0 +# GTOPCR-RELOC-NEXT: R_LARCH_PCALA_LO12 sym 0x0 +# GTOPCR-RELAX-NEXT: R_LARCH_RELAX - 0x0 + la.global $a0, sym_global # NORMAL: pcalau12i $a0, %got_pc_hi20(sym_global) # NORMAL-NEXT: ld.d $a0, $a0, %got_pc_lo12(sym_global) @@ -34,6 +57,16 @@ la.global $a0, sym_global # GTOABS-NEXT: lu32i.d $a0, %abs64_lo20(sym_global) # GTOABS-NEXT: lu52i.d $a0, $a0, %abs64_hi12(sym_global) +# RELOC-NEXT: R_LARCH_GOT_PC_HI20 sym_global 0x0 +# RELAX-NEXT: R_LARCH_RELAX - 0x0 +# RELOC-NEXT: R_LARCH_GOT_PC_LO12 sym_global 0x0 +# RELAX-NEXT: R_LARCH_RELAX - 0x0 + +# GTOPCR-RELOC-NEXT: R_LARCH_PCALA_HI20 sym_global 0x0 +# GTOPCR-RELAX-NEXT: R_LARCH_RELAX - 0x0 +# GTOPCR-RELOC-NEXT: R_LARCH_PCALA_LO12 sym_global 0x0 +# GTOPCR-RELAX-NEXT: R_LARCH_RELAX - 0x0 + la.global $a0, $a1, sym_global_large # NORMAL: pcalau12i $a0, %got_pc_hi20(sym_global_large) # NORMAL-NEXT: addi.d $a1, $zero, %got_pc_lo12(sym_global_large) @@ -52,6 +85,11 @@ la.global $a0, $a1, sym_global_large # GTOABS-NEXT: lu32i.d $a0, %abs64_lo20(sym_global_large) # GTOABS-NEXT: lu52i.d $a0, $a0, %abs64_hi12(sym_global_large) +# RELOC-NEXT: R_LARCH_GOT_PC_HI20 sym_global_large 0x0 +# RELOC-NEXT: R_LARCH_GOT_PC_LO12 sym_global_large 0x0 +# RELOC-NEXT: 
R_LARCH_GOT64_PC_LO20 sym_global_large 0x0 +# RELOC-NEXT: R_LARCH_GOT64_PC_HI12 sym_global_large 0x0 + la.local $a0, sym_local # NORMAL: pcalau12i $a0, %pc_hi20(sym_local) # NORMAL-NEXT: addi.d $a0, $a0, %pc_lo12(sym_local) @@ -61,6 +99,11 @@ la.local $a0, sym_local # LTOABS-NEXT: lu32i.d $a0, %abs64_lo20(sym_local) # LTOABS-NEXT: lu52i.d $a0, $a0, %abs64_hi12(sym_local) +# RELOC-NEXT: R_LARCH_PCALA_HI20 sym_local 0x0 +# RELAX-NEXT: R_LARCH_RELAX - 0x0 +# RELOC-NEXT: R_LARCH_PCALA_LO12 sym_local 0x0 +# RELAX-NEXT: R_LARCH_RELAX - 0x0 + la.local $a0, $a1, sym_local_large # NORMAL: pcalau12i $a0, %pc_hi20(sym_local_large) # NORMAL-NEXT: addi.d $a1, $zero, %pc_lo12(sym_local_large) @@ -72,3 +115,12 @@ la.local $a0, $a1, sym_local_large # LTOABS-NEXT: ori $a0, $a0, %abs_lo12(sym_local_large) # LTOABS-NEXT: lu32i.d $a0, %abs64_lo20(sym_local_large) # LTOABS-NEXT: lu52i.d $a0, $a0, %abs64_hi12(sym_local_large) + +# RELOC-NEXT: R_LARCH_PCALA_HI20 sym_local_large 0x0 +# RELOC-NEXT: R_LARCH_PCALA_LO12 sym_local_large 0x0 +# RELOC-NEXT: R_LARCH_PCALA64_LO20 sym_local_large 0x0 +# RELOC-NEXT: R_LARCH_PCALA64_HI12 sym_local_large 0x0 + + +# RELOC-NEXT: } +# RELOC-NEXT: ] diff --git a/llvm/test/MC/LoongArch/Macros/macros-call.s b/llvm/test/MC/LoongArch/Macros/macros-call.s index a648a39780381..df7715050a0f9 100644 --- a/llvm/test/MC/LoongArch/Macros/macros-call.s +++ b/llvm/test/MC/LoongArch/Macros/macros-call.s @@ -1,9 +1,26 @@ # RUN: llvm-mc --triple=loongarch64 %s | FileCheck %s +# RUN: llvm-mc --filetype=obj --triple=loongarch64 --mattr=-relax %s -o %t +# RUN: llvm-readobj -r %t | FileCheck %s --check-prefix=RELOC +# RUN: llvm-mc --filetype=obj --triple=loongarch64 --mattr=+relax %s -o %t.relax +# RUN: llvm-readobj -r %t.relax | FileCheck %s --check-prefixes=RELOC,RELAX + +# RELOC: Relocations [ +# RELOC-NEXT: Section ({{.*}}) .rela.text { call36 sym_call # CHECK: pcaddu18i $ra, %call36(sym_call) # CHECK-NEXT: jirl $ra, $ra, 0 +# RELOC-NEXT: R_LARCH_CALL36 sym_call 0x0 +# RELAX-NEXT: R_LARCH_RELAX - 0x0 + tail36 $t0, sym_tail # CHECK: pcaddu18i $t0, %call36(sym_tail) # CHECK-NEXT: jr $t0 + +# RELOC-NEXT: R_LARCH_CALL36 sym_tail 0x0 +# RELAX-NEXT: R_LARCH_RELAX - 0x0 + + +# RELOC-NEXT: } +# RELOC-NEXT: ] diff --git a/llvm/test/MC/LoongArch/Macros/macros-la.s b/llvm/test/MC/LoongArch/Macros/macros-la.s index d4272b93ba54d..a732988ef1f1a 100644 --- a/llvm/test/MC/LoongArch/Macros/macros-la.s +++ b/llvm/test/MC/LoongArch/Macros/macros-la.s @@ -5,6 +5,12 @@ # RUN: llvm-readobj -r %t.relax | FileCheck %s --check-prefixes=RELOC,RELAX # RUN: llvm-mc --triple=loongarch64 --mattr=+la-global-with-abs \ # RUN: %s | FileCheck %s --check-prefix=ABS +# RUN: llvm-mc --filetype=obj --triple=loongarch64 --mattr=+la-global-with-abs \ +# RUN: --mattr=-relax %s -o %t +# RUN: llvm-readobj -r %t | FileCheck %s --check-prefix=GTOABS-RELOC +# RUN: llvm-mc --filetype=obj --triple=loongarch64 --mattr=+la-global-with-abs \ +# RUN: --mattr=+relax %s -o %t.relax +# RUN: llvm-readobj -r %t.relax | FileCheck %s --check-prefixes=GTOABS-RELOC,GTOABS-RELAX # RELOC: Relocations [ # RELOC-NEXT: Section ({{.*}}) .rela.text { @@ -36,6 +42,10 @@ la.pcrel $a0, sym_pcrel # RELAX-NEXT: R_LARCH_RELAX - 0x0 # RELOC-NEXT: R_LARCH_PCALA_LO12 sym_pcrel 0x0 # RELAX-NEXT: R_LARCH_RELAX - 0x0 +# GTOABS-RELOC: R_LARCH_PCALA_HI20 sym_pcrel 0x0 +# GTOABS-RELAX-NEXT: R_LARCH_RELAX - 0x0 +# GTOABS-RELOC-NEXT: R_LARCH_PCALA_LO12 sym_pcrel 0x0 +# GTOABS-RELAX-NEXT: R_LARCH_RELAX - 0x0 la.got $a0, sym_got # CHECK-NEXT: pcalau12i $a0, %got_pc_hi20(sym_got) 
@@ -73,7 +83,9 @@ la.tls.ie $a0, sym_ie # ABS-NEXT: ld.d $a0, $a0, 0 # ABS-EMPTY: # RELOC-NEXT: R_LARCH_TLS_IE_PC_HI20 sym_ie 0x0 +# RELAX-NEXT: R_LARCH_RELAX - 0x0 # RELOC-NEXT: R_LARCH_TLS_IE_PC_LO12 sym_ie 0x0 +# RELAX-NEXT: R_LARCH_RELAX - 0x0 la.tls.ld $a0, sym_ld # CHECK-NEXT: pcalau12i $a0, %ld_pc_hi20(sym_ld) @@ -85,7 +97,9 @@ la.tls.ld $a0, sym_ld # ABS-NEXT: lu52i.d $a0, $a0, %got64_hi12(sym_ld) # ABS-EMPTY: # RELOC-NEXT: R_LARCH_TLS_LD_PC_HI20 sym_ld 0x0 +# RELAX-NEXT: R_LARCH_RELAX - 0x0 # RELOC-NEXT: R_LARCH_GOT_PC_LO12 sym_ld 0x0 +# RELAX-NEXT: R_LARCH_RELAX - 0x0 la.tls.gd $a0, sym_gd # CHECK-NEXT: pcalau12i $a0, %gd_pc_hi20(sym_gd) @@ -97,7 +111,9 @@ la.tls.gd $a0, sym_gd # ABS-NEXT: lu52i.d $a0, $a0, %got64_hi12(sym_gd) # ABS-EMPTY: # RELOC-NEXT: R_LARCH_TLS_GD_PC_HI20 sym_gd 0x0 +# RELAX-NEXT: R_LARCH_RELAX - 0x0 # RELOC-NEXT: R_LARCH_GOT_PC_LO12 sym_gd 0x0 +# RELAX-NEXT: R_LARCH_RELAX - 0x0 la.tls.desc $a0, sym_desc # CHECK-NEXT: pcalau12i $a0, %desc_pc_hi20(sym_desc) @@ -113,9 +129,13 @@ la.tls.desc $a0, sym_desc # ABS-NEXT: jirl $ra, $ra, %desc_call(sym_desc) # ABS-EMPTY: # RELOC-NEXT: R_LARCH_TLS_DESC_PC_HI20 sym_desc 0x0 +# RELAX-NEXT: R_LARCH_RELAX - 0x0 # RELOC-NEXT: R_LARCH_TLS_DESC_PC_LO12 sym_desc 0x0 +# RELAX-NEXT: R_LARCH_RELAX - 0x0 # RELOC-NEXT: R_LARCH_TLS_DESC_LD sym_desc 0x0 +# RELAX-NEXT: R_LARCH_RELAX - 0x0 # RELOC-NEXT: R_LARCH_TLS_DESC_CALL sym_desc 0x0 +# RELAX-NEXT: R_LARCH_RELAX - 0x0 ############################################################# ## with a temporary register. From 31b62e2d3df86487e7443608b5a84df754b571fd Mon Sep 17 00:00:00 2001 From: ZhaoQi Date: Fri, 17 Jan 2025 17:30:57 +0800 Subject: [PATCH 13/45] [LoongArch] Add relax relocations for tls_le code sequence (#121329) This commit add relax relocations for `tls_le` code sequence. Handwritten assembly and generating source code by clang are both affected. Scheduled `tls_le` code sequence can be relaxed normally and we can add relax relocs when code emitting according to their relocs. Other relaxable macros' code sequence cannot simply add relax relocs according to their relocs, such as `PCALA_{HI20/LO12}`, we do not want to add relax relocs when code model is large. This will be implemented in later commit. --- .../MCTargetDesc/LoongArchMCCodeEmitter.cpp | 10 ++- .../MC/LoongArch/Relocations/relax-tls-le.s | 70 +++++++++++++++++++ 2 files changed, 79 insertions(+), 1 deletion(-) create mode 100644 llvm/test/MC/LoongArch/Relocations/relax-tls-le.s diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCCodeEmitter.cpp b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCCodeEmitter.cpp index 359bde1244429..04d57f0fe7457 100644 --- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCCodeEmitter.cpp +++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCCodeEmitter.cpp @@ -282,9 +282,11 @@ LoongArchMCCodeEmitter::getExprOpValue(const MCInst &MI, const MCOperand &MO, break; case LoongArchMCExpr::VK_LoongArch_TLS_LE_HI20_R: FixupKind = LoongArch::fixup_loongarch_tls_le_hi20_r; + RelaxCandidate = true; break; case LoongArchMCExpr::VK_LoongArch_TLS_LE_LO12_R: FixupKind = LoongArch::fixup_loongarch_tls_le_lo12_r; + RelaxCandidate = true; break; case LoongArchMCExpr::VK_LoongArch_PCREL20_S2: FixupKind = LoongArch::fixup_loongarch_pcrel20_s2; @@ -387,11 +389,17 @@ void LoongArchMCCodeEmitter::expandAddTPRel(const MCInst &MI, "Expected %le_add_r relocation on TP-relative symbol"); // Emit the correct %le_add_r relocation for the symbol. 
- // TODO: Emit R_LARCH_RELAX for %le_add_r where the relax feature is enabled. Fixups.push_back(MCFixup::create( 0, Expr, MCFixupKind(LoongArch::fixup_loongarch_tls_le_add_r), MI.getLoc())); + // Emit R_LARCH_RELAX for %le_add_r when the relax feature is enabled. + if (STI.hasFeature(LoongArch::FeatureRelax)) { + const MCConstantExpr *Dummy = MCConstantExpr::create(0, Ctx); + Fixups.push_back(MCFixup::create( + 0, Dummy, MCFixupKind(LoongArch::fixup_loongarch_relax), MI.getLoc())); + } + // Emit a normal ADD instruction with the given operands. unsigned ADD = MI.getOpcode() == LoongArch::PseudoAddTPRel_D ? LoongArch::ADD_D diff --git a/llvm/test/MC/LoongArch/Relocations/relax-tls-le.s b/llvm/test/MC/LoongArch/Relocations/relax-tls-le.s new file mode 100644 index 0000000000000..899f12f85654d --- /dev/null +++ b/llvm/test/MC/LoongArch/Relocations/relax-tls-le.s @@ -0,0 +1,70 @@ +# RUN: llvm-mc --filetype=obj --triple=loongarch32 --mattr=+relax < %s \ +# RUN: | llvm-readobj -r - | FileCheck --check-prefix=LA32-RELAX-RELOC %s +# RUN: llvm-mc --filetype=obj --triple=loongarch32 --mattr=-relax < %s \ +# RUN: | llvm-readobj -r - | FileCheck --check-prefix=LA32-NORELAX-RELOC %s +# RUN: llvm-mc --triple=loongarch32 --mattr=+relax < %s --show-encoding \ +# RUN: | FileCheck --check-prefix=LA32-RELAX-FIXUP %s + +# RUN: llvm-mc --filetype=obj --triple=loongarch64 --mattr=+relax --defsym=LA64=1 < %s \ +# RUN: | llvm-readobj -r - | FileCheck --check-prefix=LA64-RELAX-RELOC %s +# RUN: llvm-mc --filetype=obj --triple=loongarch64 --mattr=-relax --defsym=LA64=1 < %s \ +# RUN: | llvm-readobj -r - | FileCheck --check-prefix=LA64-NORELAX-RELOC %s +# RUN: llvm-mc --triple=loongarch64 --mattr=+relax --defsym=LA64=1 < %s --show-encoding \ +# RUN: | FileCheck --check-prefix=LA64-RELAX-FIXUP %s + +.long foo + +.ifndef LA64 + +lu12i.w $a0, %le_hi20_r(foo) +# LA32-NORELAX-RELOC: R_LARCH_TLS_LE_HI20_R foo 0x0 +# LA32-NORELAX-RELOC-NOT: R_LARCH_RELAX - 0x0 +# LA32-RELAX-RELOC: R_LARCH_TLS_LE_HI20_R foo 0x0 +# LA32-RELAX-RELOC: R_LARCH_RELAX - 0x0 +# LA32-RELAX-FIXUP: fixup A - offset: 0, value: %le_hi20_r(foo), kind: FK_NONE +# LA32-RELAX-FIXUP: fixup B - offset: 0, value: 0, kind: FK_NONE + +add.w $a0, $a0, $tp, %le_add_r(foo) +# LA32-NORELAX-RELOC: R_LARCH_TLS_LE_ADD_R foo 0x0 +# LA32-NORELAX-RELOC-NOT: R_LARCH_RELAX - 0x0 +# LA32-RELAX-RELOC: R_LARCH_TLS_LE_ADD_R foo 0x0 +# LA32-RELAX-RELOC: R_LARCH_RELAX - 0x0 +# LA32-RELAX-FIXUP: fixup A - offset: 0, value: %le_add_r(foo), kind: FK_NONE +# LA32-RELAX-FIXUP: fixup B - offset: 0, value: 0, kind: FK_NONE + +addi.w $a0, $a0, %le_lo12_r(foo) +# LA32-NORELAX-RELOC: R_LARCH_TLS_LE_LO12_R foo 0x0 +# LA32-NORELAX-RELOC-NOT: R_LARCH_RELAX - 0x0 +# LA32-RELAX-RELOC: R_LARCH_TLS_LE_LO12_R foo 0x0 +# LA32-RELAX-RELOC: R_LARCH_RELAX - 0x0 +# LA32-RELAX-FIXUP: fixup A - offset: 0, value: %le_lo12_r(foo), kind: FK_NONE +# LA32-RELAX-FIXUP: fixup B - offset: 0, value: 0, kind: FK_NONE + +.else + +lu12i.w $a0, %le_hi20_r(foo) +# LA64-NORELAX-RELOC: R_LARCH_TLS_LE_HI20_R foo 0x0 +# LA64-NORELAX-RELOC-NOT: R_LARCH_RELAX - 0x0 +# LA64-RELAX-RELOC: R_LARCH_TLS_LE_HI20_R foo 0x0 +# LA64-RELAX-RELOC: R_LARCH_RELAX - 0x0 +# LA64-RELAX-FIXUP: fixup A - offset: 0, value: %le_hi20_r(foo), kind: FK_NONE +# LA64-RELAX-FIXUP: fixup B - offset: 0, value: 0, kind: FK_NONE + +add.d $a0, $a0, $tp, %le_add_r(foo) +# LA64-NORELAX-RELOC: R_LARCH_TLS_LE_ADD_R foo 0x0 +# LA64-NORELAX-RELOC-NOT: R_LARCH_RELAX - 0x0 +# LA64-RELAX-RELOC: R_LARCH_TLS_LE_ADD_R foo 0x0 +# LA64-RELAX-RELOC: R_LARCH_RELAX - 0x0 
+# LA64-RELAX-FIXUP: fixup A - offset: 0, value: %le_add_r(foo), kind: FK_NONE +# LA64-RELAX-FIXUP: fixup B - offset: 0, value: 0, kind: FK_NONE + +addi.d $a0, $a0, %le_lo12_r(foo) +# LA64-NORELAX-RELOC: R_LARCH_TLS_LE_LO12_R foo 0x0 +# LA64-NORELAX-RELOC-NOT: R_LARCH_RELAX - 0x0 +# LA64-RELAX-RELOC: R_LARCH_TLS_LE_LO12_R foo 0x0 +# LA64-RELAX-RELOC: R_LARCH_RELAX - 0x0 +# LA64-RELAX-FIXUP: fixup A - offset: 0, value: %le_lo12_r(foo), kind: FK_NONE +# LA64-RELAX-FIXUP: fixup B - offset: 0, value: 0, kind: FK_NONE + +.endif + From 30e276d06d3176f145151cea96ab01af0c3e842a Mon Sep 17 00:00:00 2001 From: Michael Buch Date: Fri, 17 Jan 2025 09:35:02 +0000 Subject: [PATCH 14/45] [clang][PCH] Don't try to create standalone debug-info for types marked nodebug (#123253) Fixes one of the crashes uncovered by https://github.com/llvm/llvm-project/pull/118710 `getOrCreateStandaloneType` asserts that a `DIType` was created for the requested type. If the `Decl` was marked `nodebug`, however, we can't generate debug-info for it, so we would previously trigger the assert. For now keep the assertion around and check the `nodebug` at the callsite. --- clang/lib/CodeGen/ObjectFilePCHContainerWriter.cpp | 3 +++ clang/test/Modules/gmodules-nodebug.cpp | 14 ++++++++++++++ 2 files changed, 17 insertions(+) create mode 100644 clang/test/Modules/gmodules-nodebug.cpp diff --git a/clang/lib/CodeGen/ObjectFilePCHContainerWriter.cpp b/clang/lib/CodeGen/ObjectFilePCHContainerWriter.cpp index 5447b98d7105e..02635ce235a12 100644 --- a/clang/lib/CodeGen/ObjectFilePCHContainerWriter.cpp +++ b/clang/lib/CodeGen/ObjectFilePCHContainerWriter.cpp @@ -81,6 +81,9 @@ class PCHContainerGenerator : public ASTConsumer { if (!TD->isCompleteDefinition()) return true; + if (D->hasAttr()) + return true; + QualType QualTy = Ctx.getTypeDeclType(D); if (!QualTy.isNull() && CanRepresent(QualTy.getTypePtr())) DI.getOrCreateStandaloneType(QualTy, D->getLocation()); diff --git a/clang/test/Modules/gmodules-nodebug.cpp b/clang/test/Modules/gmodules-nodebug.cpp new file mode 100644 index 0000000000000..d83103768e838 --- /dev/null +++ b/clang/test/Modules/gmodules-nodebug.cpp @@ -0,0 +1,14 @@ +// REQUIRES: asserts + +// RUN: %clang_cc1 -std=c++23 -x c++-header -emit-pch -fmodule-format=obj \ +// RUN: -o %t.pch %s \ +// RUN: -mllvm -debug-only=pchcontainer &>%t-pch.ll +// RUN: cat %t-pch.ll | FileCheck %s + +template +using __void_t [[gnu::nodebug]] = void; + +__void_t<> func() {} + +// CHECK: !DICompileUnit +// CHECK-NOT: __void_t From d7e48fbf205a01fcbc109b2555b12aa0d37845a4 Mon Sep 17 00:00:00 2001 From: NimishMishra <42909663+NimishMishra@users.noreply.github.com> Date: Fri, 17 Jan 2025 15:10:33 +0530 Subject: [PATCH 15/45] [llvm][OpenMP] Add implicit cast to omp.atomic.read (#114659) Should the operands of `omp.atomic.read` differ, emit an implicit cast. In case of `struct` arguments, extract the 0-th index, emit an implicit cast if required, and store at the destination. 
Fixes https://github.com/llvm/llvm-project/issues/112908 --- llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp | 31 ++++++++++ mlir/test/Target/LLVMIR/openmp-llvm.mlir | 71 +++++++++++++++++++++++ 2 files changed, 102 insertions(+) diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp index 188a450d12fde..7dbf65fbf055b 100644 --- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp +++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp @@ -264,6 +264,33 @@ computeOpenMPScheduleType(ScheduleKind ClauseKind, bool HasChunks, return Result; } +/// Emit an implicit cast to convert \p XRead to type of variable \p V +static llvm::Value *emitImplicitCast(IRBuilder<> &Builder, llvm::Value *XRead, + llvm::Value *V) { + // TODO: Add this functionality to the `AtomicInfo` interface + llvm::Type *XReadType = XRead->getType(); + llvm::Type *VType = V->getType(); + if (llvm::AllocaInst *vAlloca = dyn_cast(V)) + VType = vAlloca->getAllocatedType(); + + if (XReadType->isStructTy() && VType->isStructTy()) + // No need to extract or convert. A direct + // `store` will suffice. + return XRead; + + if (XReadType->isStructTy()) + XRead = Builder.CreateExtractValue(XRead, /*Idxs=*/0); + if (VType->isIntegerTy() && XReadType->isFloatingPointTy()) + XRead = Builder.CreateFPToSI(XRead, VType); + else if (VType->isFloatingPointTy() && XReadType->isIntegerTy()) + XRead = Builder.CreateSIToFP(XRead, VType); + else if (VType->isIntegerTy() && XReadType->isIntegerTy()) + XRead = Builder.CreateIntCast(XRead, VType, true); + else if (VType->isFloatingPointTy() && XReadType->isFloatingPointTy()) + XRead = Builder.CreateFPCast(XRead, VType); + return XRead; +} + /// Make \p Source branch to \p Target. /// /// Handles two situations: @@ -8501,6 +8528,8 @@ OpenMPIRBuilder::createAtomicRead(const LocationDescription &Loc, } } checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Read); + if (XRead->getType() != V.Var->getType()) + XRead = emitImplicitCast(Builder, XRead, V.Var); Builder.CreateStore(XRead, V.Var, V.IsVolatile); return Builder.saveIP(); } @@ -8785,6 +8814,8 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createAtomicCapture( return AtomicResult.takeError(); Value *CapturedVal = (IsPostfixUpdate ? 
AtomicResult->first : AtomicResult->second); + if (CapturedVal->getType() != V.Var->getType()) + CapturedVal = emitImplicitCast(Builder, CapturedVal, V.Var); Builder.CreateStore(CapturedVal, V.Var, V.IsVolatile); checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Capture); diff --git a/mlir/test/Target/LLVMIR/openmp-llvm.mlir b/mlir/test/Target/LLVMIR/openmp-llvm.mlir index 390ecabaef21b..4e4b9e5698fe9 100644 --- a/mlir/test/Target/LLVMIR/openmp-llvm.mlir +++ b/mlir/test/Target/LLVMIR/openmp-llvm.mlir @@ -1368,6 +1368,77 @@ llvm.func @omp_atomic_read(%arg0 : !llvm.ptr, %arg1 : !llvm.ptr) -> () { // ----- +// CHECK-LABEL: @omp_atomic_read_implicit_cast +llvm.func @omp_atomic_read_implicit_cast () { +//CHECK: %[[Z:.*]] = alloca float, i64 1, align 4 +//CHECK: %[[Y:.*]] = alloca double, i64 1, align 8 +//CHECK: %[[X:.*]] = alloca [2 x { float, float }], i64 1, align 8 +//CHECK: %[[W:.*]] = alloca i32, i64 1, align 4 +//CHECK: %[[X_ELEMENT:.*]] = getelementptr { float, float }, ptr %3, i64 0 + %0 = llvm.mlir.constant(1 : i64) : i64 + %1 = llvm.alloca %0 x f32 {bindc_name = "z"} : (i64) -> !llvm.ptr + %2 = llvm.mlir.constant(1 : i64) : i64 + %3 = llvm.alloca %2 x f64 {bindc_name = "y"} : (i64) -> !llvm.ptr + %4 = llvm.mlir.constant(1 : i64) : i64 + %5 = llvm.alloca %4 x !llvm.array<2 x struct<(f32, f32)>> {bindc_name = "x"} : (i64) -> !llvm.ptr + %6 = llvm.mlir.constant(1 : i64) : i64 + %7 = llvm.alloca %6 x i32 {bindc_name = "w"} : (i64) -> !llvm.ptr + %8 = llvm.mlir.constant(1 : index) : i64 + %9 = llvm.mlir.constant(2 : index) : i64 + %10 = llvm.mlir.constant(1 : i64) : i64 + %11 = llvm.mlir.constant(0 : i64) : i64 + %12 = llvm.sub %8, %10 overflow : i64 + %13 = llvm.mul %12, %10 overflow : i64 + %14 = llvm.mul %13, %10 overflow : i64 + %15 = llvm.add %14, %11 overflow : i64 + %16 = llvm.mul %10, %9 overflow : i64 + %17 = llvm.getelementptr %5[%15] : (!llvm.ptr, i64) -> !llvm.ptr, !llvm.struct<(f32, f32)> + +//CHECK: %[[ATOMIC_LOAD_TEMP:.*]] = alloca { float, float }, align 8 +//CHECK: call void @__atomic_load(i64 8, ptr %[[X_ELEMENT]], ptr %[[ATOMIC_LOAD_TEMP]], i32 0) +//CHECK: %[[LOAD:.*]] = load { float, float }, ptr %[[ATOMIC_LOAD_TEMP]], align 8 +//CHECK: %[[EXT:.*]] = extractvalue { float, float } %[[LOAD]], 0 +//CHECK: store float %[[EXT]], ptr %[[Y]], align 4 + omp.atomic.read %3 = %17 : !llvm.ptr, !llvm.ptr, !llvm.struct<(f32, f32)> + +//CHECK: %[[ATOMIC_LOAD_TEMP:.*]] = load atomic i32, ptr %[[Z]] monotonic, align 4 +//CHECK: %[[CAST:.*]] = bitcast i32 %[[ATOMIC_LOAD_TEMP]] to float +//CHECK: %[[LOAD:.*]] = fpext float %[[CAST]] to double +//CHECK: store double %[[LOAD]], ptr %[[Y]], align 8 + omp.atomic.read %3 = %1 : !llvm.ptr, !llvm.ptr, f32 + +//CHECK: %[[ATOMIC_LOAD_TEMP:.*]] = load atomic i32, ptr %[[W]] monotonic, align 4 +//CHECK: %[[LOAD:.*]] = sitofp i32 %[[ATOMIC_LOAD_TEMP]] to double +//CHECK: store double %[[LOAD]], ptr %[[Y]], align 8 + omp.atomic.read %3 = %7 : !llvm.ptr, !llvm.ptr, i32 + +//CHECK: %[[ATOMIC_LOAD_TEMP:.*]] = load atomic i64, ptr %[[Y]] monotonic, align 4 +//CHECK: %[[CAST:.*]] = bitcast i64 %[[ATOMIC_LOAD_TEMP]] to double +//CHECK: %[[LOAD:.*]] = fptrunc double %[[CAST]] to float +//CHECK: store float %[[LOAD]], ptr %[[Z]], align 4 + omp.atomic.read %1 = %3 : !llvm.ptr, !llvm.ptr, f64 + +//CHECK: %[[ATOMIC_LOAD_TEMP:.*]] = load atomic i32, ptr %[[W]] monotonic, align 4 +//CHECK: %[[LOAD:.*]] = sitofp i32 %[[ATOMIC_LOAD_TEMP]] to float +//CHECK: store float %[[LOAD]], ptr %[[Z]], align 4 + omp.atomic.read %1 = %7 : !llvm.ptr, !llvm.ptr, i32 + 
+//CHECK: %[[ATOMIC_LOAD_TEMP:.*]] = load atomic i64, ptr %[[Y]] monotonic, align 4 +//CHECK: %[[CAST:.*]] = bitcast i64 %[[ATOMIC_LOAD_TEMP]] to double +//CHECK: %[[LOAD:.*]] = fptosi double %[[CAST]] to i32 +//CHECK: store i32 %[[LOAD]], ptr %[[W]], align 4 + omp.atomic.read %7 = %3 : !llvm.ptr, !llvm.ptr, f64 + +//CHECK: %[[ATOMIC_LOAD_TEMP:.*]] = load atomic i32, ptr %[[Z]] monotonic, align 4 +//CHECK: %[[CAST:.*]] = bitcast i32 %[[ATOMIC_LOAD_TEMP]] to float +//CHECK: %[[LOAD:.*]] = fptosi float %[[CAST]] to i32 +//CHECK: store i32 %[[LOAD]], ptr %[[W]], align 4 + omp.atomic.read %7 = %1 : !llvm.ptr, !llvm.ptr, f32 + llvm.return +} + +// ----- + // CHECK-LABEL: @omp_atomic_write // CHECK-SAME: (ptr %[[x:.*]], i32 %[[expr:.*]]) llvm.func @omp_atomic_write(%x: !llvm.ptr, %expr: i32) -> () { From fbb9d49506baa05a613ab88f983d31e0f838dbae Mon Sep 17 00:00:00 2001 From: Phoebe Wang Date: Fri, 17 Jan 2025 17:51:42 +0800 Subject: [PATCH 16/45] [X86][APX] Support APX + AMX-MOVRS/AMX-TRANSPOSE (#123267) Ref.: https://cdrdv2.intel.com/v1/dl/getContent/784266 --- llvm/lib/Target/X86/X86ExpandPseudo.cpp | 20 +-- llvm/lib/Target/X86/X86ISelLowering.cpp | 24 +-- llvm/lib/Target/X86/X86InstrAMX.td | 75 +++++---- llvm/test/CodeGen/X86/amx_movrs_intrinsics.ll | 89 +++++++++++ .../X86/amx_movrs_transpose_intrinsics.ll | 30 ++++ .../CodeGen/X86/amx_transpose_intrinsics.ll | 146 ++++++++++++++++++ .../Disassembler/X86/AMX/x86-64-amx-movrs.txt | 96 ++++++++++++ .../MC/Disassembler/X86/amx-transpose-att.txt | 48 ++++++ llvm/test/MC/X86/AMX/x86-64-amx-movrs-att.s | 90 ++++++++++- llvm/test/MC/X86/AMX/x86-64-amx-movrs-intel.s | 96 ++++++++++++ llvm/test/MC/X86/amx-transpose-att.s | 48 ++++++ llvm/test/MC/X86/amx-transpose-intel.s | 48 ++++++ llvm/test/TableGen/x86-instr-mapping.inc | 10 ++ 13 files changed, 758 insertions(+), 62 deletions(-) diff --git a/llvm/lib/Target/X86/X86ExpandPseudo.cpp b/llvm/lib/Target/X86/X86ExpandPseudo.cpp index fc8a0eaed140d..7fbba7f05e0a5 100644 --- a/llvm/lib/Target/X86/X86ExpandPseudo.cpp +++ b/llvm/lib/Target/X86/X86ExpandPseudo.cpp @@ -578,10 +578,10 @@ bool X86ExpandPseudo::expandMI(MachineBasicBlock &MBB, unsigned Opc; switch (Opcode) { case X86::PTILELOADDRSV: - Opc = X86::TILELOADDRS; + Opc = GET_EGPR_IF_ENABLED(X86::TILELOADDRS); break; case X86::PTILELOADDRST1V: - Opc = X86::TILELOADDRST1; + Opc = GET_EGPR_IF_ENABLED(X86::TILELOADDRST1); break; case X86::PTILELOADDV: Opc = GET_EGPR_IF_ENABLED(X86::TILELOADD); @@ -737,28 +737,28 @@ bool X86ExpandPseudo::expandMI(MachineBasicBlock &MBB, unsigned Opc; switch (Opcode) { case X86::PT2RPNTLVWZ0V: - Opc = X86::T2RPNTLVWZ0; + Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ0); break; case X86::PT2RPNTLVWZ0T1V: - Opc = X86::T2RPNTLVWZ0T1; + Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ0T1); break; case X86::PT2RPNTLVWZ1V: - Opc = X86::T2RPNTLVWZ1; + Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ1); break; case X86::PT2RPNTLVWZ1T1V: - Opc = X86::T2RPNTLVWZ1T1; + Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ1T1); break; case X86::PT2RPNTLVWZ0RSV: - Opc = X86::T2RPNTLVWZ0RS; + Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ0RS); break; case X86::PT2RPNTLVWZ0RST1V: - Opc = X86::T2RPNTLVWZ0RST1; + Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ0RST1); break; case X86::PT2RPNTLVWZ1RSV: - Opc = X86::T2RPNTLVWZ1RS; + Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ1RS); break; case X86::PT2RPNTLVWZ1RST1V: - Opc = X86::T2RPNTLVWZ1RST1; + Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ1RST1); break; default: llvm_unreachable("Impossible Opcode!"); diff --git 
a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 90e3e15b1fb46..6d69665c17565 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -37800,14 +37800,14 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, case X86::PTILESTORED: Opc = GET_EGPR_IF_ENABLED(X86::TILESTORED); break; -#undef GET_EGPR_IF_ENABLED case X86::PTILELOADDRS: - Opc = X86::TILELOADDRS; + Opc = GET_EGPR_IF_ENABLED(X86::TILELOADDRS); break; case X86::PTILELOADDRST1: - Opc = X86::TILELOADDRST1; + Opc = GET_EGPR_IF_ENABLED(X86::TILELOADDRST1); break; } +#undef GET_EGPR_IF_ENABLED MachineInstrBuilder MIB = BuildMI(*BB, MI, MIMD, TII->get(Opc)); unsigned CurOp = 0; @@ -37838,34 +37838,36 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, case X86::PT2RPNTLVWZ1RST1: { const DebugLoc &DL = MI.getDebugLoc(); unsigned Opc; +#define GET_EGPR_IF_ENABLED(OPC) (Subtarget.hasEGPR() ? OPC##_EVEX : OPC) switch (MI.getOpcode()) { default: llvm_unreachable("Unexpected instruction!"); case X86::PT2RPNTLVWZ0: - Opc = X86::T2RPNTLVWZ0; + Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ0); break; case X86::PT2RPNTLVWZ0T1: - Opc = X86::T2RPNTLVWZ0T1; + Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ0T1); break; case X86::PT2RPNTLVWZ1: - Opc = X86::T2RPNTLVWZ1; + Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ1); break; case X86::PT2RPNTLVWZ1T1: - Opc = X86::T2RPNTLVWZ1T1; + Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ1T1); break; case X86::PT2RPNTLVWZ0RS: - Opc = X86::T2RPNTLVWZ0RS; + Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ0RS); break; case X86::PT2RPNTLVWZ0RST1: - Opc = X86::T2RPNTLVWZ0RST1; + Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ0RST1); break; case X86::PT2RPNTLVWZ1RS: - Opc = X86::T2RPNTLVWZ1RS; + Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ1RS); break; case X86::PT2RPNTLVWZ1RST1: - Opc = X86::T2RPNTLVWZ1RST1; + Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ1RST1); break; } +#undef GET_EGPR_IF_ENABLED MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc)); MIB.addReg(TMMImmToTMMPair(MI.getOperand(0).getImm()), RegState::Define); diff --git a/llvm/lib/Target/X86/X86InstrAMX.td b/llvm/lib/Target/X86/X86InstrAMX.td index a055ba91d3e17..85046228bc8c5 100644 --- a/llvm/lib/Target/X86/X86InstrAMX.td +++ b/llvm/lib/Target/X86/X86InstrAMX.td @@ -345,26 +345,33 @@ let Predicates = [HasAMXTILE, In64BitMode], isPseudo = true, SchedRW = [WriteSys def PTILEPAIRLOAD : PseudoI<(outs TILEPair:$dst), (ins opaquemem:$src), []>; } -let Predicates = [HasAMXTRANSPOSE, In64BitMode] in { - let SchedRW = [WriteSystem] in { - def T2RPNTLVWZ0 : I<0x6e, MRMSrcMemFSIB, (outs TILEPair:$dst), - (ins sibmem:$src), "t2rpntlvwz0\t{$src, $dst|$dst, $src}", - []>, VEX, WIG, T8,PS; +multiclass T2RPNTLVW_Base op1, bits<8> op2, string rs, string suffix> { + def Z0#rs#suffix : I, PS; + def Z0#rs#T1#suffix : I, PS; + def Z1#rs#suffix : I, PD; + def Z1#rs#T1#suffix : I, PD; +} - def T2RPNTLVWZ0T1 : I<0x6f, MRMSrcMemFSIB, (outs TILEPair:$dst), - (ins sibmem:$src), "t2rpntlvwz0t1\t{$src, $dst|$dst, $src}", - []>, VEX, T8,PS; +let Predicates = [HasAMXTRANSPOSE, In64BitMode], SchedRW = [WriteSystem] in + defm T2RPNTLVW : T2RPNTLVW_Base<0x6e, 0x6f, "", "">, T8, VEX; - def T2RPNTLVWZ1 : I<0x6e, MRMSrcMemFSIB, (outs TILEPair:$dst), - (ins sibmem:$src), "t2rpntlvwz1\t{$src, $dst|$dst, $src}", - []>, VEX, T8,PD; +let Predicates = [HasAMXTRANSPOSE, HasEGPR, In64BitMode], SchedRW = [WriteSystem] in + defm T2RPNTLVW : T2RPNTLVW_Base<0x6e, 0x6f, "", "_EVEX">, T8, EVEX, NoCD8; - def 
T2RPNTLVWZ1T1 : I<0x6f, MRMSrcMemFSIB, (outs TILEPair:$dst), - (ins sibmem:$src), "t2rpntlvwz1t1\t{$src, $dst|$dst, $src}", - []>, VEX, T8,PD; +let Predicates = [HasAMXMOVRS, HasAMXTRANSPOSE, In64BitMode], SchedRW = [WriteSystem] in + defm T2RPNTLVW : T2RPNTLVW_Base<0xf8, 0xf9, "RS", "">, T_MAP5, VEX; +let Predicates = [HasAMXMOVRS, HasAMXTRANSPOSE, HasEGPR, In64BitMode], SchedRW = [WriteSystem] in + defm T2RPNTLVW : T2RPNTLVW_Base<0xf8, 0xf9, "RS", "_EVEX">, T_MAP5, EVEX, NoCD8; + +let Predicates = [HasAMXTRANSPOSE, In64BitMode] in { + let SchedRW = [WriteSystem] in { def TTRANSPOSED : I<0x5f, MRMSrcReg, (outs TILE:$dst), (ins TILE:$src), - "ttransposed\t{$src, $dst|$dst, $src}", []>, VEX, T8,XS; + "ttransposed\t{$src, $dst|$dst, $src}", []>, VEX, T8, XS; let isPseudo = true in { def PT2RPNTLVWZ0V : PseudoI<(outs TILEPair:$dst), (ins GR16:$src1, GR16:$src2, GR16:$src3, opaquemem:$src4), @@ -491,22 +498,6 @@ let Predicates = [HasAMXCOMPLEX, HasAMXTRANSPOSE, In64BitMode], SchedRW = [Write } let Predicates = [HasAMXMOVRS, HasAMXTRANSPOSE, In64BitMode], SchedRW = [WriteSystem] in { - def T2RPNTLVWZ0RS : I<0xf8, MRMSrcMemFSIB, (outs TILEPair:$dst), - (ins sibmem:$src1), - "t2rpntlvwz0rs\t{$src1, $dst|$dst, $src1}", - []>, VEX, T_MAP5; - def T2RPNTLVWZ0RST1 : I<0xf9, MRMSrcMemFSIB, (outs TILEPair:$dst), - (ins sibmem:$src1), - "t2rpntlvwz0rst1\t{$src1, $dst|$dst, $src1}", - []>, VEX, T_MAP5; - def T2RPNTLVWZ1RS : I<0xf8, MRMSrcMemFSIB, (outs TILEPair:$dst), - (ins sibmem:$src1), - "t2rpntlvwz1rs\t{$src1, $dst|$dst, $src1}", - []>, VEX, T_MAP5, PD; - def T2RPNTLVWZ1RST1 : I<0xf9, MRMSrcMemFSIB, (outs TILEPair:$dst), - (ins sibmem:$src1), - "t2rpntlvwz1rst1\t{$src1, $dst|$dst, $src1}", - []>, VEX, T_MAP5, PD; let isPseudo = true in { def PT2RPNTLVWZ0RSV : PseudoI<(outs TILEPair:$dst), (ins GR16:$src1, GR16:$src2, GR16:$src3, opaquemem:$src4), @@ -529,16 +520,20 @@ let Predicates = [HasAMXMOVRS, HasAMXTRANSPOSE, In64BitMode], SchedRW = [WriteSy } } // HasAMXMOVRS, HasAMXTRANSPOSE -let Predicates = [HasAMXMOVRS, In64BitMode], SchedRW = [WriteSystem] in { - def TILELOADDRS : I<0x4a, MRMSrcMemFSIB, (outs TILE:$dst), - (ins sibmem:$src1), - "tileloaddrs\t{$src1, $dst|$dst, $src1}", - []>, VEX, T8, XD; - def TILELOADDRST1 : I<0x4a, MRMSrcMemFSIB, (outs TILE:$dst), - (ins sibmem:$src1), - "tileloaddrst1\t{$src1, $dst|$dst, $src1}", - []>, VEX, T8, PD; +multiclass TILELOADDRS_Base { + def suffix : I<0x4a, MRMSrcMemFSIB, (outs TILE:$dst), (ins sibmem:$src1), + "tileloaddrs\t{$src1, $dst|$dst, $src1}", []>, T8, XD; + def T1#suffix : I<0x4a, MRMSrcMemFSIB, (outs TILE:$dst), (ins sibmem:$src1), + "tileloaddrst1\t{$src1, $dst|$dst, $src1}", []>, T8, PD; +} + +let Predicates = [HasAMXMOVRS, In64BitMode], SchedRW = [WriteSystem] in + defm TILELOADDRS : TILELOADDRS_Base<"">, VEX; +let Predicates = [HasAMXMOVRS, HasEGPR, In64BitMode], SchedRW = [WriteSystem] in + defm TILELOADDRS : TILELOADDRS_Base<"_EVEX">, EVEX, NoCD8; + +let Predicates = [HasAMXMOVRS, In64BitMode], SchedRW = [WriteSystem] in { let isPseudo = true, mayLoad = 1 in { def PTILELOADDRSV : PseudoI<(outs TILE:$dst), (ins GR16:$src1, GR16:$src2, diff --git a/llvm/test/CodeGen/X86/amx_movrs_intrinsics.ll b/llvm/test/CodeGen/X86/amx_movrs_intrinsics.ll index da212a1850964..1b93ae029f27b 100755 --- a/llvm/test/CodeGen/X86/amx_movrs_intrinsics.ll +++ b/llvm/test/CodeGen/X86/amx_movrs_intrinsics.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -O0 -mtriple=x86_64-unknown-unknown 
-mattr=+amx-tile,+amx-movrs | FileCheck %s +; RUN: llc < %s -O0 -mtriple=x86_64-unknown-unknown -mattr=+amx-tile,+amx-movrs,+egpr --show-mc-encoding | FileCheck %s --check-prefix=EGPR define void @test_amx_internal(i16 %m, i16 %n, ptr %buf, i64 %s) { ; CHECK-LABEL: test_amx_internal: @@ -35,6 +36,44 @@ define void @test_amx_internal(i16 %m, i16 %n, ptr %buf, i64 %s) { ; CHECK-NEXT: .cfi_def_cfa %rsp, 8 ; CHECK-NEXT: tilerelease ; CHECK-NEXT: retq +; +; EGPR-LABEL: test_amx_internal: +; EGPR: # %bb.0: # %entry +; EGPR-NEXT: pushq %rbp # encoding: [0x55] +; EGPR-NEXT: .cfi_def_cfa_offset 16 +; EGPR-NEXT: .cfi_offset %rbp, -16 +; EGPR-NEXT: movq %rsp, %rbp # encoding: [0x48,0x89,0xe5] +; EGPR-NEXT: .cfi_def_cfa_register %rbp +; EGPR-NEXT: andq $-1024, %rsp # encoding: [0x48,0x81,0xe4,0x00,0xfc,0xff,0xff] +; EGPR-NEXT: # imm = 0xFC00 +; EGPR-NEXT: subq $3072, %rsp # encoding: [0x48,0x81,0xec,0x00,0x0c,0x00,0x00] +; EGPR-NEXT: # imm = 0xC00 +; EGPR-NEXT: xorps %xmm0, %xmm0 # encoding: [0x0f,0x57,0xc0] +; EGPR-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) # encoding: [0x0f,0x11,0x84,0x24,0xc0,0x03,0x00,0x00] +; EGPR-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) # encoding: [0x0f,0x11,0x84,0x24,0xd0,0x03,0x00,0x00] +; EGPR-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) # encoding: [0x0f,0x11,0x84,0x24,0xe0,0x03,0x00,0x00] +; EGPR-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) # encoding: [0x0f,0x11,0x84,0x24,0xf0,0x03,0x00,0x00] +; EGPR-NEXT: movb $1, {{[0-9]+}}(%rsp) # encoding: [0xc6,0x84,0x24,0xc0,0x03,0x00,0x00,0x01] +; EGPR-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; EGPR-NEXT: # encoding: [0x48,0x89,0x8c,0x24,0xb8,0x03,0x00,0x00] +; EGPR-NEXT: movl %esi, %eax # encoding: [0x89,0xf0] +; EGPR-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload +; EGPR-NEXT: # encoding: [0x48,0x8b,0xb4,0x24,0xb8,0x03,0x00,0x00] +; EGPR-NEXT: movw %ax, %cx # encoding: [0x66,0x89,0xc1] +; EGPR-NEXT: movw %di, %ax # encoding: [0x66,0x89,0xf8] +; EGPR-NEXT: # implicit-def: $al +; EGPR-NEXT: movb %al, {{[0-9]+}}(%rsp) # encoding: [0x88,0x84,0x24,0xf0,0x03,0x00,0x00] +; EGPR-NEXT: movw %cx, {{[0-9]+}}(%rsp) # encoding: [0x66,0x89,0x8c,0x24,0xd0,0x03,0x00,0x00] +; EGPR-NEXT: ldtilecfg {{[0-9]+}}(%rsp) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x78,0x49,0x84,0x24,0xc0,0x03,0x00,0x00] +; EGPR-NEXT: tileloaddrs (%rdx,%rsi), %tmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7b,0x4a,0x04,0x32] +; EGPR-NEXT: movl $64, %esi # encoding: [0xbe,0x40,0x00,0x00,0x00] +; EGPR-NEXT: leaq {{[0-9]+}}(%rsp), %rdx # encoding: [0x48,0x8d,0x94,0x24,0x00,0x04,0x00,0x00] +; EGPR-NEXT: tilestored %tmm0, (%rdx,%rsi) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x04,0x32] +; EGPR-NEXT: movq %rbp, %rsp # encoding: [0x48,0x89,0xec] +; EGPR-NEXT: popq %rbp # encoding: [0x5d] +; EGPR-NEXT: .cfi_def_cfa %rsp, 8 +; EGPR-NEXT: tilerelease # encoding: [0xc4,0xe2,0x78,0x49,0xc0] +; EGPR-NEXT: retq # encoding: [0xc3] entry: %t1 = call x86_amx @llvm.x86.tileloaddrs64.internal(i16 %m, i16 %n, ptr %buf, i64 %s) %t2 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %t1) @@ -48,6 +87,12 @@ define void @test_amx_old(i16 %m, i16 %n, ptr %buf) { ; CHECK-NEXT: movl $32, %eax ; CHECK-NEXT: tileloaddrs (%rdx,%rax), %tmm2 ; CHECK-NEXT: retq +; +; EGPR-LABEL: test_amx_old: +; EGPR: # %bb.0: # %entry +; EGPR-NEXT: movl $32, %eax # encoding: [0xb8,0x20,0x00,0x00,0x00] +; EGPR-NEXT: tileloaddrs (%rdx,%rax), %tmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7b,0x4a,0x14,0x02] +; EGPR-NEXT: retq # encoding: [0xc3] entry: call void 
@llvm.x86.tileloaddrs64(i8 2, ptr %buf, i64 32) ret void @@ -88,6 +133,44 @@ define void @test_amx_t1_internal(i16 %m, i16 %n, ptr %buf, i64 %s) { ; CHECK-NEXT: .cfi_def_cfa %rsp, 8 ; CHECK-NEXT: tilerelease ; CHECK-NEXT: retq +; +; EGPR-LABEL: test_amx_t1_internal: +; EGPR: # %bb.0: # %entry +; EGPR-NEXT: pushq %rbp # encoding: [0x55] +; EGPR-NEXT: .cfi_def_cfa_offset 16 +; EGPR-NEXT: .cfi_offset %rbp, -16 +; EGPR-NEXT: movq %rsp, %rbp # encoding: [0x48,0x89,0xe5] +; EGPR-NEXT: .cfi_def_cfa_register %rbp +; EGPR-NEXT: andq $-1024, %rsp # encoding: [0x48,0x81,0xe4,0x00,0xfc,0xff,0xff] +; EGPR-NEXT: # imm = 0xFC00 +; EGPR-NEXT: subq $3072, %rsp # encoding: [0x48,0x81,0xec,0x00,0x0c,0x00,0x00] +; EGPR-NEXT: # imm = 0xC00 +; EGPR-NEXT: xorps %xmm0, %xmm0 # encoding: [0x0f,0x57,0xc0] +; EGPR-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) # encoding: [0x0f,0x11,0x84,0x24,0xc0,0x03,0x00,0x00] +; EGPR-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) # encoding: [0x0f,0x11,0x84,0x24,0xd0,0x03,0x00,0x00] +; EGPR-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) # encoding: [0x0f,0x11,0x84,0x24,0xe0,0x03,0x00,0x00] +; EGPR-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) # encoding: [0x0f,0x11,0x84,0x24,0xf0,0x03,0x00,0x00] +; EGPR-NEXT: movb $1, {{[0-9]+}}(%rsp) # encoding: [0xc6,0x84,0x24,0xc0,0x03,0x00,0x00,0x01] +; EGPR-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; EGPR-NEXT: # encoding: [0x48,0x89,0x8c,0x24,0xb8,0x03,0x00,0x00] +; EGPR-NEXT: movl %esi, %eax # encoding: [0x89,0xf0] +; EGPR-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload +; EGPR-NEXT: # encoding: [0x48,0x8b,0xb4,0x24,0xb8,0x03,0x00,0x00] +; EGPR-NEXT: movw %ax, %cx # encoding: [0x66,0x89,0xc1] +; EGPR-NEXT: movw %di, %ax # encoding: [0x66,0x89,0xf8] +; EGPR-NEXT: # implicit-def: $al +; EGPR-NEXT: movb %al, {{[0-9]+}}(%rsp) # encoding: [0x88,0x84,0x24,0xf0,0x03,0x00,0x00] +; EGPR-NEXT: movw %cx, {{[0-9]+}}(%rsp) # encoding: [0x66,0x89,0x8c,0x24,0xd0,0x03,0x00,0x00] +; EGPR-NEXT: ldtilecfg {{[0-9]+}}(%rsp) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x78,0x49,0x84,0x24,0xc0,0x03,0x00,0x00] +; EGPR-NEXT: tileloaddrst1 (%rdx,%rsi), %tmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x4a,0x04,0x32] +; EGPR-NEXT: movl $64, %esi # encoding: [0xbe,0x40,0x00,0x00,0x00] +; EGPR-NEXT: leaq {{[0-9]+}}(%rsp), %rdx # encoding: [0x48,0x8d,0x94,0x24,0x00,0x04,0x00,0x00] +; EGPR-NEXT: tilestored %tmm0, (%rdx,%rsi) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x04,0x32] +; EGPR-NEXT: movq %rbp, %rsp # encoding: [0x48,0x89,0xec] +; EGPR-NEXT: popq %rbp # encoding: [0x5d] +; EGPR-NEXT: .cfi_def_cfa %rsp, 8 +; EGPR-NEXT: tilerelease # encoding: [0xc4,0xe2,0x78,0x49,0xc0] +; EGPR-NEXT: retq # encoding: [0xc3] entry: %t1 = call x86_amx @llvm.x86.tileloaddrst164.internal(i16 %m, i16 %n, ptr %buf, i64 %s) %t2 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %t1) @@ -101,6 +184,12 @@ define void @test_amx_t1_old(i16 %m, i16 %n, ptr %buf) { ; CHECK-NEXT: movl $32, %eax ; CHECK-NEXT: tileloaddrst1 (%rdx,%rax), %tmm2 ; CHECK-NEXT: retq +; +; EGPR-LABEL: test_amx_t1_old: +; EGPR: # %bb.0: # %entry +; EGPR-NEXT: movl $32, %eax # encoding: [0xb8,0x20,0x00,0x00,0x00] +; EGPR-NEXT: tileloaddrst1 (%rdx,%rax), %tmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x4a,0x14,0x02] +; EGPR-NEXT: retq # encoding: [0xc3] entry: call void @llvm.x86.tileloaddrst164(i8 2, ptr %buf, i64 32) ret void diff --git a/llvm/test/CodeGen/X86/amx_movrs_transpose_intrinsics.ll b/llvm/test/CodeGen/X86/amx_movrs_transpose_intrinsics.ll index 
146b69773eb18..1f5758c804b2b 100755 --- a/llvm/test/CodeGen/X86/amx_movrs_transpose_intrinsics.ll +++ b/llvm/test/CodeGen/X86/amx_movrs_transpose_intrinsics.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -O0 -mtriple=x86_64-unknown-unknown -mattr=+amx-transpose,+amx-movrs | FileCheck %s --check-prefixes=CHECK,O0 ; RUN: llc < %s -O2 -mtriple=x86_64-unknown-unknown -mattr=+amx-transpose,+amx-movrs | FileCheck %s --check-prefixes=CHECK,O2 +; RUN: llc < %s -O2 -mtriple=x86_64-unknown-unknown -mattr=+amx-transpose,+amx-movrs,+egpr --show-mc-encoding | FileCheck %s --check-prefix=EGPR define void @test_amx(i64 %stride, i8* %addr1) #0 { ; CHECK-LABEL: test_amx: @@ -10,6 +11,14 @@ define void @test_amx(i64 %stride, i8* %addr1) #0 { ; CHECK-NEXT: t2rpntlvwz1rs (%rsi,%rdi), %tmm0 ; CHECK-NEXT: t2rpntlvwz1rst1 (%rsi,%rdi), %tmm2 ; CHECK-NEXT: retq +; +; EGPR-LABEL: test_amx: +; EGPR: # %bb.0: +; EGPR-NEXT: t2rpntlvwz0rs (%rsi,%rdi), %tmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe5,0x78,0xf8,0x04,0x3e] +; EGPR-NEXT: t2rpntlvwz0rst1 (%rsi,%rdi), %tmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe5,0x78,0xf9,0x14,0x3e] +; EGPR-NEXT: t2rpntlvwz1rs (%rsi,%rdi), %tmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe5,0x79,0xf8,0x04,0x3e] +; EGPR-NEXT: t2rpntlvwz1rst1 (%rsi,%rdi), %tmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe5,0x79,0xf9,0x14,0x3e] +; EGPR-NEXT: retq # encoding: [0xc3] call void @llvm.x86.t2rpntlvwz0rs(i8 1, i8* %addr1, i64 %stride) call void @llvm.x86.t2rpntlvwz0rst1(i8 2, i8* %addr1, i64 %stride) call void @llvm.x86.t2rpntlvwz1rs(i8 1, i8* %addr1, i64 %stride) @@ -80,6 +89,27 @@ define void @test_amx2(i8* %base, i64 %stride) #0 { ; O2-NEXT: t2rpntlvwz1rst1 (%rdi,%rsi), %tmm4 ; O2-NEXT: tilerelease ; O2-NEXT: retq +; +; EGPR-LABEL: test_amx2: +; EGPR: # %bb.0: +; EGPR-NEXT: xorps %xmm0, %xmm0 # encoding: [0x0f,0x57,0xc0] +; EGPR-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) # encoding: [0x0f,0x11,0x44,0x24,0xc0] +; EGPR-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) # encoding: [0x0f,0x11,0x44,0x24,0xd0] +; EGPR-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) # encoding: [0x0f,0x11,0x44,0x24,0xe0] +; EGPR-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) # encoding: [0x0f,0x11,0x44,0x24,0xf0] +; EGPR-NEXT: movb $1, -{{[0-9]+}}(%rsp) # encoding: [0xc6,0x44,0x24,0xc0,0x01] +; EGPR-NEXT: movb $8, -{{[0-9]+}}(%rsp) # encoding: [0xc6,0x44,0x24,0xf4,0x08] +; EGPR-NEXT: movw $8, -{{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x44,0x24,0xd8,0x08,0x00] +; EGPR-NEXT: movb $8, -{{[0-9]+}}(%rsp) # encoding: [0xc6,0x44,0x24,0xf5,0x08] +; EGPR-NEXT: movw $8, -{{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x44,0x24,0xda,0x08,0x00] +; EGPR-NEXT: ldtilecfg -{{[0-9]+}}(%rsp) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x78,0x49,0x44,0x24,0xc0] +; EGPR-NEXT: movw $8, %ax # encoding: [0x66,0xb8,0x08,0x00] +; EGPR-NEXT: t2rpntlvwz0rs (%rdi,%rsi), %tmm4 # EVEX TO VEX Compression encoding: [0xc4,0xe5,0x78,0xf8,0x24,0x37] +; EGPR-NEXT: t2rpntlvwz0rst1 (%rdi,%rsi), %tmm4 # EVEX TO VEX Compression encoding: [0xc4,0xe5,0x78,0xf9,0x24,0x37] +; EGPR-NEXT: t2rpntlvwz1rs (%rdi,%rsi), %tmm4 # EVEX TO VEX Compression encoding: [0xc4,0xe5,0x79,0xf8,0x24,0x37] +; EGPR-NEXT: t2rpntlvwz1rst1 (%rdi,%rsi), %tmm4 # EVEX TO VEX Compression encoding: [0xc4,0xe5,0x79,0xf9,0x24,0x37] +; EGPR-NEXT: tilerelease # encoding: [0xc4,0xe2,0x78,0x49,0xc0] +; EGPR-NEXT: retq # encoding: [0xc3] call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0rs.internal(i16 8, i16 8, i16 8, i8* %base, i64 %stride) call { x86_amx, 
x86_amx } @llvm.x86.t2rpntlvwz0rst1.internal(i16 8, i16 8, i16 8, i8* %base, i64 %stride) call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1rs.internal(i16 8, i16 8, i16 8, i8* %base, i64 %stride) diff --git a/llvm/test/CodeGen/X86/amx_transpose_intrinsics.ll b/llvm/test/CodeGen/X86/amx_transpose_intrinsics.ll index cc4360317db7d..4cfd97afe721b 100644 --- a/llvm/test/CodeGen/X86/amx_transpose_intrinsics.ll +++ b/llvm/test/CodeGen/X86/amx_transpose_intrinsics.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+amx-bf16,+amx-fp16,+amx-complex,+amx-transpose | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+amx-bf16,+amx-fp16,+amx-complex,+amx-transpose,+egpr --show-mc-encoding | FileCheck %s --check-prefix=EGPR define void @test_amx(i32 %rv32, i64 %stride, i64 %rvalue, i8* %addr1, <4 x float> %xmm) #0 { ; CHECK-LABEL: test_amx: @@ -16,6 +17,21 @@ define void @test_amx(i32 %rv32, i64 %stride, i64 %rvalue, i8* %addr1, <4 x floa ; CHECK-NEXT: tconjtcmmimfp16ps %tmm3, %tmm2, %tmm1 ; CHECK-NEXT: tconjtfp16 %tmm2, %tmm1 ; CHECK-NEXT: retq +; +; EGPR-LABEL: test_amx: +; EGPR: # %bb.0: +; EGPR-NEXT: t2rpntlvwz0 (%rcx,%rsi), %tmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x78,0x6e,0x04,0x31] +; EGPR-NEXT: t2rpntlvwz0t1 (%rcx,%rsi), %tmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x78,0x6f,0x14,0x31] +; EGPR-NEXT: t2rpntlvwz1 (%rcx,%rsi), %tmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x6e,0x04,0x31] +; EGPR-NEXT: t2rpntlvwz1t1 (%rcx,%rsi), %tmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x6f,0x14,0x31] +; EGPR-NEXT: ttransposed %tmm3, %tmm1 # encoding: [0xc4,0xe2,0x7a,0x5f,0xcb] +; EGPR-NEXT: ttdpbf16ps %tmm3, %tmm2, %tmm1 # encoding: [0xc4,0xe2,0x62,0x6c,0xca] +; EGPR-NEXT: ttdpfp16ps %tmm6, %tmm5, %tmm4 # encoding: [0xc4,0xe2,0x4b,0x6c,0xe5] +; EGPR-NEXT: ttcmmimfp16ps %tmm3, %tmm2, %tmm1 # encoding: [0xc4,0xe2,0x63,0x6b,0xca] +; EGPR-NEXT: ttcmmrlfp16ps %tmm3, %tmm2, %tmm1 # encoding: [0xc4,0xe2,0x62,0x6b,0xca] +; EGPR-NEXT: tconjtcmmimfp16ps %tmm3, %tmm2, %tmm1 # encoding: [0xc4,0xe2,0x60,0x6b,0xca] +; EGPR-NEXT: tconjtfp16 %tmm2, %tmm1 # encoding: [0xc4,0xe2,0x79,0x6b,0xca] +; EGPR-NEXT: retq # encoding: [0xc3] call void @llvm.x86.t2rpntlvwz0(i8 1, i8* %addr1, i64 %stride) call void @llvm.x86.t2rpntlvwz0t1(i8 2, i8* %addr1, i64 %stride) call void @llvm.x86.t2rpntlvwz1(i8 1, i8* %addr1, i64 %stride) @@ -78,6 +94,46 @@ define void @test_amx2(i8* %pointer, i8* %base, i64 %stride) #0 { ; CHECK-NEXT: tilerelease ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq +; +; EGPR-LABEL: test_amx2: +; EGPR: # %bb.0: +; EGPR-NEXT: pushq %rbp # encoding: [0x55] +; EGPR-NEXT: subq $2928, %rsp # encoding: [0x48,0x81,0xec,0x70,0x0b,0x00,0x00] +; EGPR-NEXT: # imm = 0xB70 +; EGPR-NEXT: vxorps %xmm0, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x57,0xc0] +; EGPR-NEXT: vmovups %zmm0, {{[0-9]+}}(%rsp) # encoding: [0x62,0xf1,0x7c,0x48,0x11,0x44,0x24,0x0d] +; EGPR-NEXT: movb $1, {{[0-9]+}}(%rsp) # encoding: [0xc6,0x84,0x24,0x40,0x03,0x00,0x00,0x01] +; EGPR-NEXT: movb $8, {{[0-9]+}}(%rsp) # encoding: [0xc6,0x84,0x24,0x70,0x03,0x00,0x00,0x08] +; EGPR-NEXT: movw $8, {{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x84,0x24,0x50,0x03,0x00,0x00,0x08,0x00] +; EGPR-NEXT: movb $8, {{[0-9]+}}(%rsp) # encoding: [0xc6,0x84,0x24,0x71,0x03,0x00,0x00,0x08] +; EGPR-NEXT: movw $8, {{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x84,0x24,0x52,0x03,0x00,0x00,0x08,0x00] +; EGPR-NEXT: movb $8, 
{{[0-9]+}}(%rsp) # encoding: [0xc6,0x84,0x24,0x72,0x03,0x00,0x00,0x08] +; EGPR-NEXT: movw $8, {{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x84,0x24,0x54,0x03,0x00,0x00,0x08,0x00] +; EGPR-NEXT: movb $8, {{[0-9]+}}(%rsp) # encoding: [0xc6,0x84,0x24,0x73,0x03,0x00,0x00,0x08] +; EGPR-NEXT: movw $8, {{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x84,0x24,0x56,0x03,0x00,0x00,0x08,0x00] +; EGPR-NEXT: ldtilecfg {{[0-9]+}}(%rsp) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x78,0x49,0x84,0x24,0x40,0x03,0x00,0x00] +; EGPR-NEXT: movw $8, %ax # encoding: [0x66,0xb8,0x08,0x00] +; EGPR-NEXT: tileloadd (%rsi,%rdx), %tmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7b,0x4b,0x04,0x16] +; EGPR-NEXT: tilezero %tmm1 # encoding: [0xc4,0xe2,0x7b,0x49,0xc8] +; EGPR-NEXT: tilezero %tmm2 # encoding: [0xc4,0xe2,0x7b,0x49,0xd0] +; EGPR-NEXT: ttdpbf16ps %tmm1, %tmm0, %tmm2 # encoding: [0xc4,0xe2,0x72,0x6c,0xd0] +; EGPR-NEXT: ttdpfp16ps %tmm1, %tmm0, %tmm2 # encoding: [0xc4,0xe2,0x73,0x6c,0xd0] +; EGPR-NEXT: ttcmmimfp16ps %tmm1, %tmm0, %tmm2 # encoding: [0xc4,0xe2,0x73,0x6b,0xd0] +; EGPR-NEXT: ttcmmrlfp16ps %tmm1, %tmm0, %tmm2 # encoding: [0xc4,0xe2,0x72,0x6b,0xd0] +; EGPR-NEXT: movabsq $64, %rbp # encoding: [0x48,0xbd,0x40,0x00,0x00,0x00,0x00,0x00,0x00,0x00] +; EGPR-NEXT: tilestored %tmm2, 896(%rsp,%rbp) # 1024-byte Folded Spill +; EGPR-NEXT: # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x94,0x2c,0x80,0x03,0x00,0x00] +; EGPR-NEXT: tileloadd 896(%rsp,%rbp), %tmm3 # 1024-byte Folded Reload +; EGPR-NEXT: # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7b,0x4b,0x9c,0x2c,0x80,0x03,0x00,0x00] +; EGPR-NEXT: tconjtcmmimfp16ps %tmm1, %tmm0, %tmm3 # encoding: [0xc4,0xe2,0x70,0x6b,0xd8] +; EGPR-NEXT: tconjtfp16 %tmm3, %tmm0 # encoding: [0xc4,0xe2,0x79,0x6b,0xc3] +; EGPR-NEXT: tilestored %tmm2, (%rdi,%rdx) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x14,0x17] +; EGPR-NEXT: addq $2928, %rsp # encoding: [0x48,0x81,0xc4,0x70,0x0b,0x00,0x00] +; EGPR-NEXT: # imm = 0xB70 +; EGPR-NEXT: popq %rbp # encoding: [0x5d] +; EGPR-NEXT: tilerelease # encoding: [0xc4,0xe2,0x78,0x49,0xc0] +; EGPR-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; EGPR-NEXT: retq # encoding: [0xc3] %a = call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 8, i8* %base, i64 %stride) %b = call x86_amx @llvm.x86.tilezero.internal(i16 8, i16 8) @@ -117,6 +173,30 @@ define void @test_amx3(i8* %pointer, i8* %base, i64 %stride) #0 { ; CHECK-NEXT: tilerelease ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq +; +; EGPR-LABEL: test_amx3: +; EGPR: # %bb.0: +; EGPR-NEXT: vxorps %xmm0, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x57,0xc0] +; EGPR-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp) # encoding: [0x62,0xf1,0x7c,0x48,0x11,0x44,0x24,0xff] +; EGPR-NEXT: movb $1, -{{[0-9]+}}(%rsp) # encoding: [0xc6,0x44,0x24,0xc0,0x01] +; EGPR-NEXT: movb $8, -{{[0-9]+}}(%rsp) # encoding: [0xc6,0x44,0x24,0xf0,0x08] +; EGPR-NEXT: movw $8, -{{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x44,0x24,0xd0,0x08,0x00] +; EGPR-NEXT: movb $8, -{{[0-9]+}}(%rsp) # encoding: [0xc6,0x44,0x24,0xf4,0x08] +; EGPR-NEXT: movw $8, -{{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x44,0x24,0xd8,0x08,0x00] +; EGPR-NEXT: movb $0, -{{[0-9]+}}(%rsp) # encoding: [0xc6,0x44,0x24,0xf5,0x00] +; EGPR-NEXT: movw $0, -{{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x44,0x24,0xda,0x00,0x00] +; EGPR-NEXT: ldtilecfg -{{[0-9]+}}(%rsp) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x78,0x49,0x44,0x24,0xc0] +; EGPR-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0] +; EGPR-NEXT: movw $8, %cx # encoding: [0x66,0xb9,0x08,0x00] +; EGPR-NEXT: 
t2rpntlvwz0 (%rsi,%rdx), %tmm4 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x78,0x6e,0x24,0x16] +; EGPR-NEXT: t2rpntlvwz0t1 (%rsi,%rdx), %tmm4 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x78,0x6f,0x24,0x16] +; EGPR-NEXT: t2rpntlvwz1 (%rsi,%rdx), %tmm4 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x6e,0x24,0x16] +; EGPR-NEXT: t2rpntlvwz1t1 (%rsi,%rdx), %tmm4 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x6f,0x24,0x16] +; EGPR-NEXT: ttransposed %tmm4, %tmm0 # encoding: [0xc4,0xe2,0x7a,0x5f,0xc4] +; EGPR-NEXT: tilestored %tmm0, (%rdi,%rdx) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x04,0x17] +; EGPR-NEXT: tilerelease # encoding: [0xc4,0xe2,0x78,0x49,0xc0] +; EGPR-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; EGPR-NEXT: retq # encoding: [0xc3] %1 = call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16 8, i16 8, i16 0, i8* %base, i64 %stride) %2 = call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0t1.internal(i16 8, i16 8, i16 0, i8* %base, i64 %stride) %3 = call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1.internal(i16 8, i16 8, i16 0, i8* %base, i64 %stride) @@ -179,6 +259,72 @@ define void @test_amx_spill(i8* %pointer, i8* %base, i64 %stride) #0 { ; CHECK-NEXT: tilerelease ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq +; +; EGPR-LABEL: test_amx_spill: +; EGPR: # %bb.0: +; EGPR-NEXT: subq $6088, %rsp # encoding: [0x48,0x81,0xec,0xc8,0x17,0x00,0x00] +; EGPR-NEXT: # imm = 0x17C8 +; EGPR-NEXT: vxorps %xmm0, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x57,0xc0] +; EGPR-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp) # encoding: [0x62,0xf1,0x7c,0x48,0x11,0x44,0x24,0xfe] +; EGPR-NEXT: movb $1, -{{[0-9]+}}(%rsp) # encoding: [0xc6,0x44,0x24,0x80,0x01] +; EGPR-NEXT: movb $8, -{{[0-9]+}}(%rsp) # encoding: [0xc6,0x44,0x24,0xb0,0x08] +; EGPR-NEXT: movw $8, -{{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x44,0x24,0x90,0x08,0x00] +; EGPR-NEXT: movb $8, -{{[0-9]+}}(%rsp) # encoding: [0xc6,0x44,0x24,0xb4,0x08] +; EGPR-NEXT: movw $8, -{{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x44,0x24,0x98,0x08,0x00] +; EGPR-NEXT: movb $8, -{{[0-9]+}}(%rsp) # encoding: [0xc6,0x44,0x24,0xb5,0x08] +; EGPR-NEXT: movw $8, -{{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x44,0x24,0x9a,0x08,0x00] +; EGPR-NEXT: movb $8, -{{[0-9]+}}(%rsp) # encoding: [0xc6,0x44,0x24,0xb6,0x08] +; EGPR-NEXT: movw $8, -{{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x44,0x24,0x9c,0x08,0x00] +; EGPR-NEXT: movb $8, -{{[0-9]+}}(%rsp) # encoding: [0xc6,0x44,0x24,0xb7,0x08] +; EGPR-NEXT: movw $8, -{{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x44,0x24,0x9e,0x08,0x00] +; EGPR-NEXT: ldtilecfg -{{[0-9]+}}(%rsp) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x78,0x49,0x44,0x24,0x80] +; EGPR-NEXT: movw $8, %ax # encoding: [0x66,0xb8,0x08,0x00] +; EGPR-NEXT: tileloadd (%rsi,%rdx), %tmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7b,0x4b,0x04,0x16] +; EGPR-NEXT: t2rpntlvwz0 (%rsi,%rdx), %tmm4 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x78,0x6e,0x24,0x16] +; EGPR-NEXT: t2rpntlvwz0t1 (%rsi,%rdx), %tmm6 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x78,0x6f,0x34,0x16] +; EGPR-NEXT: movabsq $64, %rcx # encoding: [0x48,0xb9,0x40,0x00,0x00,0x00,0x00,0x00,0x00,0x00] +; EGPR-NEXT: tilestored %tmm6, 4032(%rsp,%rcx) # 1024-byte Folded Spill +; EGPR-NEXT: # encoding: [0xc4,0xe2,0x7a,0x4b,0xb4,0x0c,0xc0,0x0f,0x00,0x00] +; EGPR-NEXT: tilestored %tmm7, 5056(%rsp,%rcx) # 1024-byte Folded Spill +; EGPR-NEXT: # encoding: [0xc4,0xe2,0x7a,0x4b,0xbc,0x0c,0xc0,0x13,0x00,0x00] +; EGPR-NEXT: t2rpntlvwz1 (%rsi,%rdx), %tmm6 # EVEX TO VEX Compression encoding: 
[0xc4,0xe2,0x79,0x6e,0x34,0x16] +; EGPR-NEXT: tilestored %tmm6, 1984(%rsp,%rcx) # 1024-byte Folded Spill +; EGPR-NEXT: # encoding: [0xc4,0xe2,0x7a,0x4b,0xb4,0x0c,0xc0,0x07,0x00,0x00] +; EGPR-NEXT: tilestored %tmm7, 3008(%rsp,%rcx) # 1024-byte Folded Spill +; EGPR-NEXT: # encoding: [0xc4,0xe2,0x7a,0x4b,0xbc,0x0c,0xc0,0x0b,0x00,0x00] +; EGPR-NEXT: t2rpntlvwz1t1 (%rsi,%rdx), %tmm6 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x6f,0x34,0x16] +; EGPR-NEXT: tilestored %tmm6, -64(%rsp,%rcx) # 1024-byte Folded Spill +; EGPR-NEXT: # encoding: [0xc4,0xe2,0x7a,0x4b,0x74,0x0c,0xc0] +; EGPR-NEXT: tilestored %tmm7, 960(%rsp,%rcx) # 1024-byte Folded Spill +; EGPR-NEXT: # encoding: [0xc4,0xe2,0x7a,0x4b,0xbc,0x0c,0xc0,0x03,0x00,0x00] +; EGPR-NEXT: t2rpntlvwz0 (%rsi,%rdx), %tmm6 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x78,0x6e,0x34,0x16] +; EGPR-NEXT: tilestored %tmm4, (%rsi,%rdx) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x24,0x16] +; EGPR-NEXT: tilestored %tmm5, (%rsi,%rdx) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x2c,0x16] +; EGPR-NEXT: tileloadd 4032(%rsp,%rcx), %tmm4 # 1024-byte Folded Reload +; EGPR-NEXT: # encoding: [0xc4,0xe2,0x7b,0x4b,0xa4,0x0c,0xc0,0x0f,0x00,0x00] +; EGPR-NEXT: tileloadd 5056(%rsp,%rcx), %tmm5 # 1024-byte Folded Reload +; EGPR-NEXT: # encoding: [0xc4,0xe2,0x7b,0x4b,0xac,0x0c,0xc0,0x13,0x00,0x00] +; EGPR-NEXT: tilestored %tmm4, (%rsi,%rdx) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x24,0x16] +; EGPR-NEXT: tilestored %tmm5, (%rsi,%rdx) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x2c,0x16] +; EGPR-NEXT: tileloadd 1984(%rsp,%rcx), %tmm4 # 1024-byte Folded Reload +; EGPR-NEXT: # encoding: [0xc4,0xe2,0x7b,0x4b,0xa4,0x0c,0xc0,0x07,0x00,0x00] +; EGPR-NEXT: tileloadd 3008(%rsp,%rcx), %tmm5 # 1024-byte Folded Reload +; EGPR-NEXT: # encoding: [0xc4,0xe2,0x7b,0x4b,0xac,0x0c,0xc0,0x0b,0x00,0x00] +; EGPR-NEXT: tilestored %tmm4, (%rsi,%rdx) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x24,0x16] +; EGPR-NEXT: tilestored %tmm5, (%rsi,%rdx) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x2c,0x16] +; EGPR-NEXT: tileloadd -64(%rsp,%rcx), %tmm4 # 1024-byte Folded Reload +; EGPR-NEXT: # encoding: [0xc4,0xe2,0x7b,0x4b,0x64,0x0c,0xc0] +; EGPR-NEXT: tileloadd 960(%rsp,%rcx), %tmm5 # 1024-byte Folded Reload +; EGPR-NEXT: # encoding: [0xc4,0xe2,0x7b,0x4b,0xac,0x0c,0xc0,0x03,0x00,0x00] +; EGPR-NEXT: tilestored %tmm4, (%rsi,%rdx) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x24,0x16] +; EGPR-NEXT: tilestored %tmm5, (%rsi,%rdx) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x2c,0x16] +; EGPR-NEXT: tilestored %tmm6, (%rsi,%rdx) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x34,0x16] +; EGPR-NEXT: tilestored %tmm7, (%rsi,%rdx) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x3c,0x16] +; EGPR-NEXT: addq $6088, %rsp # encoding: [0x48,0x81,0xc4,0xc8,0x17,0x00,0x00] +; EGPR-NEXT: # imm = 0x17C8 +; EGPR-NEXT: tilerelease # encoding: [0xc4,0xe2,0x78,0x49,0xc0] +; EGPR-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; EGPR-NEXT: retq # encoding: [0xc3] %a = call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 8, i8* %base, i64 %stride) %b1 = call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16 8, i16 8, i16 8, i8* %base, i64 %stride) %b2 = call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0t1.internal(i16 8, i16 8, i16 8, i8* %base, i64 %stride) diff --git a/llvm/test/MC/Disassembler/X86/AMX/x86-64-amx-movrs.txt b/llvm/test/MC/Disassembler/X86/AMX/x86-64-amx-movrs.txt 
index 6df44c87d2332..57e3153da401b 100755 --- a/llvm/test/MC/Disassembler/X86/AMX/x86-64-amx-movrs.txt +++ b/llvm/test/MC/Disassembler/X86/AMX/x86-64-amx-movrs.txt @@ -96,3 +96,99 @@ # ATT: tileloaddrst1 -32(,%rbp,2), %tmm3 # INTEL: tileloaddrst1 tmm3, [2*rbp - 32] 0xc4,0xe2,0x79,0x4a,0x1c,0x6d,0xe0,0xff,0xff,0xff + +# ATT: t2rpntlvwz0rs 268435456(%r16,%r14,8), %tmm6 +# INTEL: t2rpntlvwz0rs tmm6, [r16 + 8*r14 + 268435456] +0x62,0xbd,0x7c,0x08,0xf8,0xb4,0xf0,0x00,0x00,0x00,0x10 + +# ATT: t2rpntlvwz0rs 291(%r8,%r17,4), %tmm2 +# INTEL: t2rpntlvwz0rs tmm2, [r8 + 4*r17 + 291] +0x62,0xd5,0x78,0x08,0xf8,0x94,0x88,0x23,0x01,0x00,0x00 + +# ATT: t2rpntlvwz0rs 64(%r18), %tmm6 +# INTEL: t2rpntlvwz0rs tmm6, [r18 + 64] +0x62,0xfd,0x7c,0x08,0xf8,0x74,0x22,0x40 + +# ATT: t2rpntlvwz0rs -32(,%rbp,2), %tmm2 +# INTEL: t2rpntlvwz0rs tmm2, [2*rbp - 32] +0x62,0xf5,0x7c,0x08,0xf8,0x14,0x6d,0xe0,0xff,0xff,0xff + +# ATT: t2rpntlvwz0rst1 268435456(%r16,%r14,8), %tmm6 +# INTEL: t2rpntlvwz0rst1 tmm6, [r16 + 8*r14 + 268435456] +0x62,0xbd,0x7c,0x08,0xf9,0xb4,0xf0,0x00,0x00,0x00,0x10 + +# ATT: t2rpntlvwz0rst1 291(%r8,%r17,4), %tmm2 +# INTEL: t2rpntlvwz0rst1 tmm2, [r8 + 4*r17 + 291] +0x62,0xd5,0x78,0x08,0xf9,0x94,0x88,0x23,0x01,0x00,0x00 + +# ATT: t2rpntlvwz0rst1 64(%r18), %tmm6 +# INTEL: t2rpntlvwz0rst1 tmm6, [r18 + 64] +0x62,0xfd,0x7c,0x08,0xf9,0x74,0x22,0x40 + +# ATT: t2rpntlvwz0rst1 -32(,%rbp,2), %tmm2 +# INTEL: t2rpntlvwz0rst1 tmm2, [2*rbp - 32] +0x62,0xf5,0x7c,0x08,0xf9,0x14,0x6d,0xe0,0xff,0xff,0xff + +# ATT: t2rpntlvwz1rs 268435456(%r16,%r14,8), %tmm6 +# INTEL: t2rpntlvwz1rs tmm6, [r16 + 8*r14 + 268435456] +0x62,0xbd,0x7d,0x08,0xf8,0xb4,0xf0,0x00,0x00,0x00,0x10 + +# ATT: t2rpntlvwz1rs 291(%r8,%r17,4), %tmm2 +# INTEL: t2rpntlvwz1rs tmm2, [r8 + 4*r17 + 291] +0x62,0xd5,0x79,0x08,0xf8,0x94,0x88,0x23,0x01,0x00,0x00 + +# ATT: t2rpntlvwz1rs 64(%r18), %tmm6 +# INTEL: t2rpntlvwz1rs tmm6, [r18 + 64] +0x62,0xfd,0x7d,0x08,0xf8,0x74,0x22,0x40 + +# ATT: t2rpntlvwz1rs -32(,%rbp,2), %tmm2 +# INTEL: t2rpntlvwz1rs tmm2, [2*rbp - 32] +0x62,0xf5,0x7d,0x08,0xf8,0x14,0x6d,0xe0,0xff,0xff,0xff + +# ATT: t2rpntlvwz1rst1 268435456(%r16,%r14,8), %tmm6 +# INTEL: t2rpntlvwz1rst1 tmm6, [r16 + 8*r14 + 268435456] +0x62,0xbd,0x7d,0x08,0xf9,0xb4,0xf0,0x00,0x00,0x00,0x10 + +# ATT: t2rpntlvwz1rst1 291(%r8,%r17,4), %tmm2 +# INTEL: t2rpntlvwz1rst1 tmm2, [r8 + 4*r17 + 291] +0x62,0xd5,0x79,0x08,0xf9,0x94,0x88,0x23,0x01,0x00,0x00 + +# ATT: t2rpntlvwz1rst1 64(%r18), %tmm6 +# INTEL: t2rpntlvwz1rst1 tmm6, [r18 + 64] +0x62,0xfd,0x7d,0x08,0xf9,0x74,0x22,0x40 + +# ATT: t2rpntlvwz1rst1 -32(,%rbp,2), %tmm2 +# INTEL: t2rpntlvwz1rst1 tmm2, [2*rbp - 32] +0x62,0xf5,0x7d,0x08,0xf9,0x14,0x6d,0xe0,0xff,0xff,0xff + +# ATT: tileloaddrs 268435456(%r16,%r14,8), %tmm6 +# INTEL: tileloaddrs tmm6, [r16 + 8*r14 + 268435456] +0x62,0xba,0x7f,0x08,0x4a,0xb4,0xf0,0x00,0x00,0x00,0x10 + +# ATT: tileloaddrs 291(%r8,%r17,4), %tmm3 +# INTEL: tileloaddrs tmm3, [r8 + 4*r17 + 291] +0x62,0xd2,0x7b,0x08,0x4a,0x9c,0x88,0x23,0x01,0x00,0x00 + +# ATT: tileloaddrs 64(%r18), %tmm6 +# INTEL: tileloaddrs tmm6, [r18 + 64] +0x62,0xfa,0x7f,0x08,0x4a,0x74,0x22,0x40 + +# ATT: tileloaddrs -32(,%rbp,2), %tmm3 +# INTEL: tileloaddrs tmm3, [2*rbp - 32] +0x62,0xf2,0x7f,0x08,0x4a,0x1c,0x6d,0xe0,0xff,0xff,0xff + +# ATT: tileloaddrst1 268435456(%r16,%r14,8), %tmm6 +# INTEL: tileloaddrst1 tmm6, [r16 + 8*r14 + 268435456] +0x62,0xba,0x7d,0x08,0x4a,0xb4,0xf0,0x00,0x00,0x00,0x10 + +# ATT: tileloaddrst1 291(%r8,%r17,4), %tmm3 +# INTEL: tileloaddrst1 tmm3, [r8 + 4*r17 + 291] 
+0x62,0xd2,0x79,0x08,0x4a,0x9c,0x88,0x23,0x01,0x00,0x00 + +# ATT: tileloaddrst1 64(%r18), %tmm6 +# INTEL: tileloaddrst1 tmm6, [r18 + 64] +0x62,0xfa,0x7d,0x08,0x4a,0x74,0x22,0x40 + +# ATT: tileloaddrst1 -32(,%rbp,2), %tmm3 +# INTEL: tileloaddrst1 tmm3, [2*rbp - 32] +0x62,0xf2,0x7d,0x08,0x4a,0x1c,0x6d,0xe0,0xff,0xff,0xff diff --git a/llvm/test/MC/Disassembler/X86/amx-transpose-att.txt b/llvm/test/MC/Disassembler/X86/amx-transpose-att.txt index 8c6f1be80ba2d..d768630ac1475 100644 --- a/llvm/test/MC/Disassembler/X86/amx-transpose-att.txt +++ b/llvm/test/MC/Disassembler/X86/amx-transpose-att.txt @@ -49,6 +49,54 @@ # INTEL: t2rpntlvwz1t1 tmm2, [2*rbp - 32] 0xc4,0xe2,0x79,0x6f,0x14,0x6d,0xe0,0xff,0xff,0xff +# ATT: t2rpntlvwz0 268435456(%r16,%r14,8), %tmm4 +# INTEL: t2rpntlvwz0 tmm4, [r16 + 8*r14 + 268435456] +0x62,0xba,0x7c,0x08,0x6e,0xa4,0xf0,0x00,0x00,0x00,0x10 + +# ATT: t2rpntlvwz0 291(%r8,%r17,4), %tmm2 +# INTEL: t2rpntlvwz0 tmm2, [r8 + 4*r17 + 291] +0x62,0xd2,0x78,0x08,0x6e,0x94,0x88,0x23,0x01,0x00,0x00 + +# ATT: t2rpntlvwz0 -32(,%rbp,2), %tmm2 +# INTEL: t2rpntlvwz0 tmm2, [2*rbp - 32] +0x62,0xf2,0x7c,0x08,0x6e,0x14,0x6d,0xe0,0xff,0xff,0xff + +# ATT: t2rpntlvwz0t1 268435456(%r16,%r14,8), %tmm4 +# INTEL: t2rpntlvwz0t1 tmm4, [r16 + 8*r14 + 268435456] +0x62,0xba,0x7c,0x08,0x6f,0xa4,0xf0,0x00,0x00,0x00,0x10 + +# ATT: t2rpntlvwz0t1 291(%r8,%r17,4), %tmm2 +# INTEL: t2rpntlvwz0t1 tmm2, [r8 + 4*r17 + 291] +0x62,0xd2,0x78,0x08,0x6f,0x94,0x88,0x23,0x01,0x00,0x00 + +# ATT: t2rpntlvwz0t1 -32(,%rbp,2), %tmm2 +# INTEL: t2rpntlvwz0t1 tmm2, [2*rbp - 32] +0x62,0xf2,0x7c,0x08,0x6f,0x14,0x6d,0xe0,0xff,0xff,0xff + +# ATT: t2rpntlvwz1 268435456(%r16,%r14,8), %tmm4 +# INTEL: t2rpntlvwz1 tmm4, [r16 + 8*r14 + 268435456] +0x62,0xba,0x7d,0x08,0x6e,0xa4,0xf0,0x00,0x00,0x00,0x10 + +# ATT: t2rpntlvwz1 291(%r8,%r17,4), %tmm2 +# INTEL: t2rpntlvwz1 tmm2, [r8 + 4*r17 + 291] +0x62,0xd2,0x79,0x08,0x6e,0x94,0x88,0x23,0x01,0x00,0x00 + +# ATT: t2rpntlvwz1 -32(,%rbp,2), %tmm2 +# INTEL: t2rpntlvwz1 tmm2, [2*rbp - 32] +0x62,0xf2,0x7d,0x08,0x6e,0x14,0x6d,0xe0,0xff,0xff,0xff + +# ATT: t2rpntlvwz1t1 268435456(%r16,%r14,8), %tmm4 +# INTEL: t2rpntlvwz1t1 tmm4, [r16 + 8*r14 + 268435456] +0x62,0xba,0x7d,0x08,0x6f,0xa4,0xf0,0x00,0x00,0x00,0x10 + +# ATT: t2rpntlvwz1t1 291(%r8,%r17,4), %tmm2 +# INTEL: t2rpntlvwz1t1 tmm2, [r8 + 4*r17 + 291] +0x62,0xd2,0x79,0x08,0x6f,0x94,0x88,0x23,0x01,0x00,0x00 + +# ATT: t2rpntlvwz1t1 -32(,%rbp,2), %tmm2 +# INTEL: t2rpntlvwz1t1 tmm2, [2*rbp - 32] +0x62,0xf2,0x7d,0x08,0x6f,0x14,0x6d,0xe0,0xff,0xff,0xff + # ATT: ttransposed %tmm1, %tmm2 # INTEL: ttransposed tmm2, tmm1 0xc4,0xe2,0x7a,0x5f,0xd1 diff --git a/llvm/test/MC/X86/AMX/x86-64-amx-movrs-att.s b/llvm/test/MC/X86/AMX/x86-64-amx-movrs-att.s index d780ad4f0e369..92db672e1c82d 100755 --- a/llvm/test/MC/X86/AMX/x86-64-amx-movrs-att.s +++ b/llvm/test/MC/X86/AMX/x86-64-amx-movrs-att.s @@ -86,4 +86,92 @@ // CHECK: tileloaddrst1 -32(,%rbp,2), %tmm3 // CHECK: encoding: [0xc4,0xe2,0x79,0x4a,0x1c,0x6d,0xe0,0xff,0xff,0xff] - tileloaddrst1 -32(,%rbp,2), %tmm3 \ No newline at end of file + tileloaddrst1 -32(,%rbp,2), %tmm3 + +// CHECK: t2rpntlvwz0rs 268435456(%r16,%r14,8), %tmm6 +// CHECK: encoding: [0x62,0xbd,0x7c,0x08,0xf8,0xb4,0xf0,0x00,0x00,0x00,0x10] + t2rpntlvwz0rs 268435456(%r16,%r14,8), %tmm6 + +// CHECK: t2rpntlvwz0rs 291(%r8,%r17,4), %tmm2 +// CHECK: encoding: [0x62,0xd5,0x78,0x08,0xf8,0x94,0x88,0x23,0x01,0x00,0x00] + t2rpntlvwz0rs 291(%r8,%r17,4), %tmm2 + +// CHECK: t2rpntlvwz0rs 64(%r18), %tmm6 +// CHECK: encoding: [0x62,0xfd,0x7c,0x08,0xf8,0x74,0x22,0x40] + 
t2rpntlvwz0rs 64(%r18), %tmm6 + +// CHECK: {evex} t2rpntlvwz0rs -32(,%rbp,2), %tmm2 +// CHECK: encoding: [0x62,0xf5,0x7c,0x08,0xf8,0x14,0x6d,0xe0,0xff,0xff,0xff] + {evex} t2rpntlvwz0rs -32(,%rbp,2), %tmm2 + +// CHECK: t2rpntlvwz0rst1 268435456(%r16,%r14,8), %tmm6 +// CHECK: encoding: [0x62,0xbd,0x7c,0x08,0xf9,0xb4,0xf0,0x00,0x00,0x00,0x10] + t2rpntlvwz0rst1 268435456(%r16,%r14,8), %tmm6 + +// CHECK: t2rpntlvwz0rst1 291(%r8,%r17,4), %tmm2 +// CHECK: encoding: [0x62,0xd5,0x78,0x08,0xf9,0x94,0x88,0x23,0x01,0x00,0x00] + t2rpntlvwz0rst1 291(%r8,%r17,4), %tmm2 + +// CHECK: t2rpntlvwz0rst1 64(%r18), %tmm6 +// CHECK: encoding: [0x62,0xfd,0x7c,0x08,0xf9,0x74,0x22,0x40] + t2rpntlvwz0rst1 64(%r18), %tmm6 + +// CHECK: {evex} t2rpntlvwz0rst1 -32(,%rbp,2), %tmm2 +// CHECK: encoding: [0x62,0xf5,0x7c,0x08,0xf9,0x14,0x6d,0xe0,0xff,0xff,0xff] + {evex} t2rpntlvwz0rst1 -32(,%rbp,2), %tmm2 + +// CHECK: t2rpntlvwz1rs 268435456(%r16,%r14,8), %tmm6 +// CHECK: encoding: [0x62,0xbd,0x7d,0x08,0xf8,0xb4,0xf0,0x00,0x00,0x00,0x10] + t2rpntlvwz1rs 268435456(%r16,%r14,8), %tmm6 + +// CHECK: t2rpntlvwz1rs 291(%r8,%r17,4), %tmm2 +// CHECK: encoding: [0x62,0xd5,0x79,0x08,0xf8,0x94,0x88,0x23,0x01,0x00,0x00] + t2rpntlvwz1rs 291(%r8,%r17,4), %tmm2 + +// CHECK: t2rpntlvwz1rs 64(%r18), %tmm6 +// CHECK: encoding: [0x62,0xfd,0x7d,0x08,0xf8,0x74,0x22,0x40] + t2rpntlvwz1rs 64(%r18), %tmm6 + +// CHECK: {evex} t2rpntlvwz1rs -32(,%rbp,2), %tmm2 +// CHECK: encoding: [0x62,0xf5,0x7d,0x08,0xf8,0x14,0x6d,0xe0,0xff,0xff,0xff] + {evex} t2rpntlvwz1rs -32(,%rbp,2), %tmm2 + +// CHECK: t2rpntlvwz1rst1 268435456(%r16,%r14,8), %tmm6 +// CHECK: encoding: [0x62,0xbd,0x7d,0x08,0xf9,0xb4,0xf0,0x00,0x00,0x00,0x10] + t2rpntlvwz1rst1 268435456(%r16,%r14,8), %tmm6 + +// CHECK: t2rpntlvwz1rst1 291(%r8,%r17,4), %tmm2 +// CHECK: encoding: [0x62,0xd5,0x79,0x08,0xf9,0x94,0x88,0x23,0x01,0x00,0x00] + t2rpntlvwz1rst1 291(%r8,%r17,4), %tmm2 + +// CHECK: t2rpntlvwz1rst1 64(%r18), %tmm6 +// CHECK: encoding: [0x62,0xfd,0x7d,0x08,0xf9,0x74,0x22,0x40] + t2rpntlvwz1rst1 64(%r18), %tmm6 + +// CHECK: {evex} t2rpntlvwz1rst1 -32(,%rbp,2), %tmm2 +// CHECK: encoding: [0x62,0xf5,0x7d,0x08,0xf9,0x14,0x6d,0xe0,0xff,0xff,0xff] + {evex} t2rpntlvwz1rst1 -32(,%rbp,2), %tmm2 + +// CHECK: tileloaddrs 291(%r16,%rax,4), %tmm3 +// CHECK: encoding: [0x62,0xfa,0x7f,0x08,0x4a,0x9c,0x80,0x23,0x01,0x00,0x00] + tileloaddrs 291(%r16,%rax,4), %tmm3 + +// CHECK: tileloaddrs 291(%r8,%r17,4), %tmm3 +// CHECK: encoding: [0x62,0xd2,0x7b,0x08,0x4a,0x9c,0x88,0x23,0x01,0x00,0x00] + tileloaddrs 291(%r8,%r17,4), %tmm3 + +// CHECK: {evex} tileloaddrs -32(,%rbp,2), %tmm3 +// CHECK: encoding: [0x62,0xf2,0x7f,0x08,0x4a,0x1c,0x6d,0xe0,0xff,0xff,0xff] + {evex} tileloaddrs -32(,%rbp,2), %tmm3 + +// CHECK: tileloaddrst1 291(%r16,%rax,4), %tmm3 +// CHECK: encoding: [0x62,0xfa,0x7d,0x08,0x4a,0x9c,0x80,0x23,0x01,0x00,0x00] + tileloaddrst1 291(%r16,%rax,4), %tmm3 + +// CHECK: tileloaddrst1 291(%r8,%r17,4), %tmm3 +// CHECK: encoding: [0x62,0xd2,0x79,0x08,0x4a,0x9c,0x88,0x23,0x01,0x00,0x00] + tileloaddrst1 291(%r8,%r17,4), %tmm3 + +// CHECK: {evex} tileloaddrst1 -32(,%rbp,2), %tmm3 +// CHECK: encoding: [0x62,0xf2,0x7d,0x08,0x4a,0x1c,0x6d,0xe0,0xff,0xff,0xff] + {evex} tileloaddrst1 -32(,%rbp,2), %tmm3 diff --git a/llvm/test/MC/X86/AMX/x86-64-amx-movrs-intel.s b/llvm/test/MC/X86/AMX/x86-64-amx-movrs-intel.s index ccc7ac51a98a4..140d1aa6b198e 100755 --- a/llvm/test/MC/X86/AMX/x86-64-amx-movrs-intel.s +++ b/llvm/test/MC/X86/AMX/x86-64-amx-movrs-intel.s @@ -95,3 +95,99 @@ // CHECK: tileloaddrst1 tmm3, [2*rbp - 32] // CHECK: 
encoding: [0xc4,0xe2,0x79,0x4a,0x1c,0x6d,0xe0,0xff,0xff,0xff] tileloaddrst1 tmm3, [2*rbp - 32] + +// CHECK: t2rpntlvwz0rs tmm6, [r16 + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xbd,0x7c,0x08,0xf8,0xb4,0xf0,0x00,0x00,0x00,0x10] + t2rpntlvwz0rs tmm6, [r16 + 8*r14 + 268435456] + +// CHECK: t2rpntlvwz0rs tmm2, [r8 + 4*r17 + 291] +// CHECK: encoding: [0x62,0xd5,0x78,0x08,0xf8,0x94,0x88,0x23,0x01,0x00,0x00] + t2rpntlvwz0rs tmm2, [r8 + 4*r17 + 291] + +// CHECK: t2rpntlvwz0rs tmm6, [r18 + 64] +// CHECK: encoding: [0x62,0xfd,0x7c,0x08,0xf8,0x74,0x22,0x40] + t2rpntlvwz0rs tmm6, [r18 + 64] + +// CHECK: {evex} t2rpntlvwz0rs tmm2, [2*rbp - 32] +// CHECK: encoding: [0x62,0xf5,0x7c,0x08,0xf8,0x14,0x6d,0xe0,0xff,0xff,0xff] + {evex} t2rpntlvwz0rs tmm2, [2*rbp - 32] + +// CHECK: t2rpntlvwz0rst1 tmm6, [r16 + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xbd,0x7c,0x08,0xf9,0xb4,0xf0,0x00,0x00,0x00,0x10] + t2rpntlvwz0rst1 tmm6, [r16 + 8*r14 + 268435456] + +// CHECK: t2rpntlvwz0rst1 tmm2, [r8 + 4*r17 + 291] +// CHECK: encoding: [0x62,0xd5,0x78,0x08,0xf9,0x94,0x88,0x23,0x01,0x00,0x00] + t2rpntlvwz0rst1 tmm2, [r8 + 4*r17 + 291] + +// CHECK: t2rpntlvwz0rst1 tmm6, [r18 + 64] +// CHECK: encoding: [0x62,0xfd,0x7c,0x08,0xf9,0x74,0x22,0x40] + t2rpntlvwz0rst1 tmm6, [r18 + 64] + +// CHECK: {evex} t2rpntlvwz0rst1 tmm2, [2*rbp - 32] +// CHECK: encoding: [0x62,0xf5,0x7c,0x08,0xf9,0x14,0x6d,0xe0,0xff,0xff,0xff] + {evex} t2rpntlvwz0rst1 tmm2, [2*rbp - 32] + +// CHECK: t2rpntlvwz1rs tmm6, [r16 + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xbd,0x7d,0x08,0xf8,0xb4,0xf0,0x00,0x00,0x00,0x10] + t2rpntlvwz1rs tmm6, [r16 + 8*r14 + 268435456] + +// CHECK: t2rpntlvwz1rs tmm2, [r8 + 4*r17 + 291] +// CHECK: encoding: [0x62,0xd5,0x79,0x08,0xf8,0x94,0x88,0x23,0x01,0x00,0x00] + t2rpntlvwz1rs tmm2, [r8 + 4*r17 + 291] + +// CHECK: t2rpntlvwz1rs tmm6, [r18 + 64] +// CHECK: encoding: [0x62,0xfd,0x7d,0x08,0xf8,0x74,0x22,0x40] + t2rpntlvwz1rs tmm6, [r18 + 64] + +// CHECK: {evex} t2rpntlvwz1rs tmm2, [2*rbp - 32] +// CHECK: encoding: [0x62,0xf5,0x7d,0x08,0xf8,0x14,0x6d,0xe0,0xff,0xff,0xff] + {evex} t2rpntlvwz1rs tmm2, [2*rbp - 32] + +// CHECK: t2rpntlvwz1rst1 tmm6, [r16 + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xbd,0x7d,0x08,0xf9,0xb4,0xf0,0x00,0x00,0x00,0x10] + t2rpntlvwz1rst1 tmm6, [r16 + 8*r14 + 268435456] + +// CHECK: t2rpntlvwz1rst1 tmm2, [r8 + 4*r17 + 291] +// CHECK: encoding: [0x62,0xd5,0x79,0x08,0xf9,0x94,0x88,0x23,0x01,0x00,0x00] + t2rpntlvwz1rst1 tmm2, [r8 + 4*r17 + 291] + +// CHECK: t2rpntlvwz1rst1 tmm6, [r18 + 64] +// CHECK: encoding: [0x62,0xfd,0x7d,0x08,0xf9,0x74,0x22,0x40] + t2rpntlvwz1rst1 tmm6, [r18 + 64] + +// CHECK: {evex} t2rpntlvwz1rst1 tmm2, [2*rbp - 32] +// CHECK: encoding: [0x62,0xf5,0x7d,0x08,0xf9,0x14,0x6d,0xe0,0xff,0xff,0xff] + {evex} t2rpntlvwz1rst1 tmm2, [2*rbp - 32] + +// CHECK: tileloaddrs tmm6, [r16 + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xba,0x7f,0x08,0x4a,0xb4,0xf0,0x00,0x00,0x00,0x10] + tileloaddrs tmm6, [r16 + 8*r14 + 268435456] + +// CHECK: tileloaddrs tmm3, [r8 + 4*r17 + 291] +// CHECK: encoding: [0x62,0xd2,0x7b,0x08,0x4a,0x9c,0x88,0x23,0x01,0x00,0x00] + tileloaddrs tmm3, [r8 + 4*r17 + 291] + +// CHECK: tileloaddrs tmm6, [r18 + 64] +// CHECK: encoding: [0x62,0xfa,0x7f,0x08,0x4a,0x74,0x22,0x40] + tileloaddrs tmm6, [r18 + 64] + +// CHECK: {evex} tileloaddrs tmm3, [2*rbp - 32] +// CHECK: encoding: [0x62,0xf2,0x7f,0x08,0x4a,0x1c,0x6d,0xe0,0xff,0xff,0xff] + {evex} tileloaddrs tmm3, [2*rbp - 32] + +// CHECK: tileloaddrst1 tmm6, [r16 + 8*r14 + 268435456] +// CHECK: encoding: 
[0x62,0xba,0x7d,0x08,0x4a,0xb4,0xf0,0x00,0x00,0x00,0x10] + tileloaddrst1 tmm6, [r16 + 8*r14 + 268435456] + +// CHECK: tileloaddrst1 tmm3, [r8 + 4*r17 + 291] +// CHECK: encoding: [0x62,0xd2,0x79,0x08,0x4a,0x9c,0x88,0x23,0x01,0x00,0x00] + tileloaddrst1 tmm3, [r8 + 4*r17 + 291] + +// CHECK: tileloaddrst1 tmm6, [r18 + 64] +// CHECK: encoding: [0x62,0xfa,0x7d,0x08,0x4a,0x74,0x22,0x40] + tileloaddrst1 tmm6, [r18 + 64] + +// CHECK: {evex} tileloaddrst1 tmm3, [2*rbp - 32] +// CHECK: encoding: [0x62,0xf2,0x7d,0x08,0x4a,0x1c,0x6d,0xe0,0xff,0xff,0xff] + {evex} tileloaddrst1 tmm3, [2*rbp - 32] diff --git a/llvm/test/MC/X86/amx-transpose-att.s b/llvm/test/MC/X86/amx-transpose-att.s index 21bbf258ac6ef..5158470f8c905 100644 --- a/llvm/test/MC/X86/amx-transpose-att.s +++ b/llvm/test/MC/X86/amx-transpose-att.s @@ -48,6 +48,54 @@ // CHECK: encoding: [0xc4,0xe2,0x79,0x6f,0x14,0x6d,0xe0,0xff,0xff,0xff] t2rpntlvwz1t1 -32(,%rbp,2), %tmm2 +// CHECK: t2rpntlvwz0 268435456(%r16,%r14,8), %tmm4 +// CHECK: encoding: [0x62,0xba,0x7c,0x08,0x6e,0xa4,0xf0,0x00,0x00,0x00,0x10] + t2rpntlvwz0 268435456(%r16,%r14,8), %tmm4 + +// CHECK: t2rpntlvwz0 291(%r8,%r17,4), %tmm2 +// CHECK: encoding: [0x62,0xd2,0x78,0x08,0x6e,0x94,0x88,0x23,0x01,0x00,0x00] + t2rpntlvwz0 291(%r8,%r17,4), %tmm2 + +// CHECK: {evex} t2rpntlvwz0 -32(,%rbp,2), %tmm2 +// CHECK: encoding: [0x62,0xf2,0x7c,0x08,0x6e,0x14,0x6d,0xe0,0xff,0xff,0xff] + {evex} t2rpntlvwz0 -32(,%rbp,2), %tmm2 + +// CHECK: t2rpntlvwz0t1 268435456(%r16,%r14,8), %tmm4 +// CHECK: encoding: [0x62,0xba,0x7c,0x08,0x6f,0xa4,0xf0,0x00,0x00,0x00,0x10] + t2rpntlvwz0t1 268435456(%r16,%r14,8), %tmm4 + +// CHECK: t2rpntlvwz0t1 291(%r8,%r17,4), %tmm2 +// CHECK: encoding: [0x62,0xd2,0x78,0x08,0x6f,0x94,0x88,0x23,0x01,0x00,0x00] + t2rpntlvwz0t1 291(%r8,%r17,4), %tmm2 + +// CHECK: {evex} t2rpntlvwz0t1 -32(,%rbp,2), %tmm2 +// CHECK: encoding: [0x62,0xf2,0x7c,0x08,0x6f,0x14,0x6d,0xe0,0xff,0xff,0xff] + {evex} t2rpntlvwz0t1 -32(,%rbp,2), %tmm2 + +// CHECK: t2rpntlvwz1 268435456(%r16,%r14,8), %tmm4 +// CHECK: encoding: [0x62,0xba,0x7d,0x08,0x6e,0xa4,0xf0,0x00,0x00,0x00,0x10] + t2rpntlvwz1 268435456(%r16,%r14,8), %tmm4 + +// CHECK: t2rpntlvwz1 291(%r8,%r17,4), %tmm2 +// CHECK: encoding: [0x62,0xd2,0x79,0x08,0x6e,0x94,0x88,0x23,0x01,0x00,0x00] + t2rpntlvwz1 291(%r8,%r17,4), %tmm2 + +// CHECK: {evex} t2rpntlvwz1 -32(,%rbp,2), %tmm2 +// CHECK: encoding: [0x62,0xf2,0x7d,0x08,0x6e,0x14,0x6d,0xe0,0xff,0xff,0xff] + {evex} t2rpntlvwz1 -32(,%rbp,2), %tmm2 + +// CHECK: t2rpntlvwz1t1 268435456(%r16,%r14,8), %tmm4 +// CHECK: encoding: [0x62,0xba,0x7d,0x08,0x6f,0xa4,0xf0,0x00,0x00,0x00,0x10] + t2rpntlvwz1t1 268435456(%r16,%r14,8), %tmm4 + +// CHECK: t2rpntlvwz1t1 291(%r8,%r17,4), %tmm2 +// CHECK: encoding: [0x62,0xd2,0x79,0x08,0x6f,0x94,0x88,0x23,0x01,0x00,0x00] + t2rpntlvwz1t1 291(%r8,%r17,4), %tmm2 + +// CHECK: {evex} t2rpntlvwz1t1 -32(,%rbp,2), %tmm2 +// CHECK: encoding: [0x62,0xf2,0x7d,0x08,0x6f,0x14,0x6d,0xe0,0xff,0xff,0xff] + {evex} t2rpntlvwz1t1 -32(,%rbp,2), %tmm2 + // CHECK: ttransposed %tmm1, %tmm5 // CHECK: encoding: [0xc4,0xe2,0x7a,0x5f,0xe9] ttransposed %tmm1, %tmm5 diff --git a/llvm/test/MC/X86/amx-transpose-intel.s b/llvm/test/MC/X86/amx-transpose-intel.s index a772232ddbbf2..0d2c22f67a173 100644 --- a/llvm/test/MC/X86/amx-transpose-intel.s +++ b/llvm/test/MC/X86/amx-transpose-intel.s @@ -48,6 +48,54 @@ // CHECK: encoding: [0xc4,0xe2,0x79,0x6f,0x14,0x6d,0xe0,0xff,0xff,0xff] t2rpntlvwz1t1 tmm2, [2*rbp - 32] +// CHECK: t2rpntlvwz0 tmm4, [r16 + 8*r14 + 268435456] +// CHECK: encoding: 
[0x62,0xba,0x7c,0x08,0x6e,0xa4,0xf0,0x00,0x00,0x00,0x10] + t2rpntlvwz0 tmm4, [r16 + 8*r14 + 268435456] + +// CHECK: t2rpntlvwz0 tmm2, [r8 + 4*r17 + 291] +// CHECK: encoding: [0x62,0xd2,0x78,0x08,0x6e,0x94,0x88,0x23,0x01,0x00,0x00] + t2rpntlvwz0 tmm2, [r8 + 4*r17 + 291] + +// CHECK: {evex} t2rpntlvwz0 tmm2, [2*rbp - 32] +// CHECK: encoding: [0x62,0xf2,0x7c,0x08,0x6e,0x14,0x6d,0xe0,0xff,0xff,0xff] + {evex} t2rpntlvwz0 tmm2, [2*rbp - 32] + +// CHECK: t2rpntlvwz0t1 tmm4, [r16 + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xba,0x7c,0x08,0x6f,0xa4,0xf0,0x00,0x00,0x00,0x10] + t2rpntlvwz0t1 tmm4, [r16 + 8*r14 + 268435456] + +// CHECK: t2rpntlvwz0t1 tmm2, [r8 + 4*r17 + 291] +// CHECK: encoding: [0x62,0xd2,0x78,0x08,0x6f,0x94,0x88,0x23,0x01,0x00,0x00] + t2rpntlvwz0t1 tmm2, [r8 + 4*r17 + 291] + +// CHECK: {evex} t2rpntlvwz0t1 tmm2, [2*rbp - 32] +// CHECK: encoding: [0x62,0xf2,0x7c,0x08,0x6f,0x14,0x6d,0xe0,0xff,0xff,0xff] + {evex} t2rpntlvwz0t1 tmm2, [2*rbp - 32] + +// CHECK: t2rpntlvwz1 tmm4, [r16 + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xba,0x7d,0x08,0x6e,0xa4,0xf0,0x00,0x00,0x00,0x10] + t2rpntlvwz1 tmm4, [r16 + 8*r14 + 268435456] + +// CHECK: t2rpntlvwz1 tmm2, [r8 + 4*r17 + 291] +// CHECK: encoding: [0x62,0xd2,0x79,0x08,0x6e,0x94,0x88,0x23,0x01,0x00,0x00] + t2rpntlvwz1 tmm2, [r8 + 4*r17 + 291] + +// CHECK: {evex} t2rpntlvwz1 tmm2, [2*rbp - 32] +// CHECK: encoding: [0x62,0xf2,0x7d,0x08,0x6e,0x14,0x6d,0xe0,0xff,0xff,0xff] + {evex} t2rpntlvwz1 tmm2, [2*rbp - 32] + +// CHECK: t2rpntlvwz1t1 tmm4, [r16 + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xba,0x7d,0x08,0x6f,0xa4,0xf0,0x00,0x00,0x00,0x10] + t2rpntlvwz1t1 tmm4, [r16 + 8*r14 + 268435456] + +// CHECK: t2rpntlvwz1t1 tmm2, [r8 + 4*r17 + 291] +// CHECK: encoding: [0x62,0xd2,0x79,0x08,0x6f,0x94,0x88,0x23,0x01,0x00,0x00] + t2rpntlvwz1t1 tmm2, [r8 + 4*r17 + 291] + +// CHECK: {evex} t2rpntlvwz1t1 tmm2, [2*rbp - 32] +// CHECK: encoding: [0x62,0xf2,0x7d,0x08,0x6f,0x14,0x6d,0xe0,0xff,0xff,0xff] + {evex} t2rpntlvwz1t1 tmm2, [2*rbp - 32] + // CHECK: ttransposed tmm5, tmm1 // CHECK: encoding: [0xc4,0xe2,0x7a,0x5f,0xe9] ttransposed tmm5, tmm1 diff --git a/llvm/test/TableGen/x86-instr-mapping.inc b/llvm/test/TableGen/x86-instr-mapping.inc index 55d392f5e271f..4f64d4b8d93d0 100644 --- a/llvm/test/TableGen/x86-instr-mapping.inc +++ b/llvm/test/TableGen/x86-instr-mapping.inc @@ -167,6 +167,16 @@ static const X86TableEntry X86CompressEVEXTable[] = { { X86::SHRX64rm_EVEX, X86::SHRX64rm }, { X86::SHRX64rr_EVEX, X86::SHRX64rr }, { X86::STTILECFG_EVEX, X86::STTILECFG }, + { X86::T2RPNTLVWZ0RST1_EVEX, X86::T2RPNTLVWZ0RST1 }, + { X86::T2RPNTLVWZ0RS_EVEX, X86::T2RPNTLVWZ0RS }, + { X86::T2RPNTLVWZ0T1_EVEX, X86::T2RPNTLVWZ0T1 }, + { X86::T2RPNTLVWZ0_EVEX, X86::T2RPNTLVWZ0 }, + { X86::T2RPNTLVWZ1RST1_EVEX, X86::T2RPNTLVWZ1RST1 }, + { X86::T2RPNTLVWZ1RS_EVEX, X86::T2RPNTLVWZ1RS }, + { X86::T2RPNTLVWZ1T1_EVEX, X86::T2RPNTLVWZ1T1 }, + { X86::T2RPNTLVWZ1_EVEX, X86::T2RPNTLVWZ1 }, + { X86::TILELOADDRST1_EVEX, X86::TILELOADDRST1 }, + { X86::TILELOADDRS_EVEX, X86::TILELOADDRS }, { X86::TILELOADDT1_EVEX, X86::TILELOADDT1 }, { X86::TILELOADD_EVEX, X86::TILELOADD }, { X86::TILESTORED_EVEX, X86::TILESTORED }, From ee4282259d5993dfa0b7b8937541dd6ccaadf3d5 Mon Sep 17 00:00:00 2001 From: Nicholas <45984215+liusy58@users.noreply.github.com> Date: Fri, 17 Jan 2025 17:55:55 +0800 Subject: [PATCH 17/45] [BOLT][AArch64]support `inline-small-functions` for AArch64 (#120187) Add some functions in `AArch64MCPlusBuilder.cpp` to support inline for AArch64. 
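A sketch of the intended effect, mirroring the new inline-small-function tests
added below (instruction sequence taken from those tests; the label name is
illustrative): when a callee that ends in a tail call is inlined, the tail call
has to become an ordinary call so that execution continues with the caller's
code after the inlined body. On AArch64 that is the `b` -> `bl` / `br` -> `blr`
rewrite performed by the new convertTailCallToCall hook:

  // callee before inlining:
  indirect:
      add w0, w1, w0
      br  x2              // indirect tail call

  // after inlining into the caller, BOLT emits:
      add w0, w1, w0
      blr x2              // converted from `br`, so control returns here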
--- bolt/lib/Passes/Inliner.cpp | 4 +- .../Target/AArch64/AArch64MCPlusBuilder.cpp | 30 ++++++++++++ bolt/test/AArch64/inline-small-function-1.s | 42 ++++++++++++++++ bolt/test/AArch64/inline-small-function-2.s | 48 +++++++++++++++++++ 4 files changed, 122 insertions(+), 2 deletions(-) create mode 100644 bolt/test/AArch64/inline-small-function-1.s create mode 100644 bolt/test/AArch64/inline-small-function-2.s diff --git a/bolt/lib/Passes/Inliner.cpp b/bolt/lib/Passes/Inliner.cpp index f004a8eeea185..1793f4ff1f148 100644 --- a/bolt/lib/Passes/Inliner.cpp +++ b/bolt/lib/Passes/Inliner.cpp @@ -310,13 +310,13 @@ Inliner::inlineCall(BinaryBasicBlock &CallerBB, if (MIB.isPseudo(Inst)) continue; - MIB.stripAnnotations(Inst, /*KeepTC=*/BC.isX86()); + MIB.stripAnnotations(Inst, /*KeepTC=*/BC.isX86() || BC.isAArch64()); // Fix branch target. Strictly speaking, we don't have to do this as // targets of direct branches will be fixed later and don't matter // in the CFG state. However, disassembly may look misleading, and // hence we do the fixing. - if (MIB.isBranch(Inst)) { + if (MIB.isBranch(Inst) && !MIB.isTailCall(Inst)) { assert(!MIB.isIndirectBranch(Inst) && "unexpected indirect branch in callee"); const BinaryBasicBlock *TargetBB = diff --git a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp index d752751c17932..d84da10b5bbe6 100644 --- a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp +++ b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp @@ -133,6 +133,36 @@ class AArch64MCPlusBuilder : public MCPlusBuilder { public: using MCPlusBuilder::MCPlusBuilder; + MCPhysReg getStackPointer() const override { return AArch64::SP; } + + bool isPush(const MCInst &Inst) const override { return false; } + + bool isPop(const MCInst &Inst) const override { return false; } + + void createCall(MCInst &Inst, const MCSymbol *Target, + MCContext *Ctx) override { + createDirectCall(Inst, Target, Ctx, false); + } + + bool convertTailCallToCall(MCInst &Inst) override { + int NewOpcode; + switch (Inst.getOpcode()) { + default: + return false; + case AArch64::B: + NewOpcode = AArch64::BL; + break; + case AArch64::BR: + NewOpcode = AArch64::BLR; + break; + } + + Inst.setOpcode(NewOpcode); + removeAnnotation(Inst, MCPlus::MCAnnotation::kTailCall); + clearOffset(Inst); + return true; + } + bool equals(const MCTargetExpr &A, const MCTargetExpr &B, CompFuncTy Comp) const override { const auto &AArch64ExprA = cast(A); diff --git a/bolt/test/AArch64/inline-small-function-1.s b/bolt/test/AArch64/inline-small-function-1.s new file mode 100644 index 0000000000000..3ea22a9915fb4 --- /dev/null +++ b/bolt/test/AArch64/inline-small-function-1.s @@ -0,0 +1,42 @@ +## This test checks that inline is properly handled by BOLT on aarch64. + +# REQUIRES: system-linux + +# RUN: llvm-mc -filetype=obj -triple aarch64-unknown-unknown %s -o %t.o +# RUN: %clang %cflags -O0 %t.o -o %t.exe -Wl,-q +# RUN: llvm-bolt --inline-small-functions --print-inline --print-only=_Z3barP1A \ +# RUN: %t.exe -o %t.bolt | FileCheck %s + +# CHECK: BOLT-INFO: inlined 0 calls at 1 call sites in 2 iteration(s). Change in binary size: 4 bytes. +# CHECK: Binary Function "_Z3barP1A" after inlining { +# CHECK-NOT: bl _Z3fooP1A +# CHECK: ldr x8, [x0] +# CHECK-NEXT: ldr w0, [x8] + + .text + .globl _Z3fooP1A + .type _Z3fooP1A,@function +_Z3fooP1A: + ldr x8, [x0] + ldr w0, [x8] + ret + .size _Z3fooP1A, .-_Z3fooP1A + + .globl _Z3barP1A + .type _Z3barP1A,@function +_Z3barP1A: + stp x29, x30, [sp, #-16]! 
+ mov x29, sp + bl _Z3fooP1A + mul w0, w0, w0 + ldp x29, x30, [sp], #16 + ret + .size _Z3barP1A, .-_Z3barP1A + + .globl main + .p2align 2 + .type main,@function +main: + mov w0, wzr + ret + .size main, .-main diff --git a/bolt/test/AArch64/inline-small-function-2.s b/bolt/test/AArch64/inline-small-function-2.s new file mode 100644 index 0000000000000..5eb7d391fd157 --- /dev/null +++ b/bolt/test/AArch64/inline-small-function-2.s @@ -0,0 +1,48 @@ +## This test checks that inline is properly handled by BOLT on aarch64. + +# REQUIRES: system-linux + +# RUN: llvm-mc -filetype=obj -triple aarch64-unknown-unknown %s -o %t.o +# RUN: %clang %cflags -O0 %t.o -o %t.exe -Wl,-q +# RUN: llvm-bolt --inline-small-functions --print-inline --print-only=test \ +# RUN: %t.exe -o %t.bolt | FileCheck %s + +#CHECK: BOLT-INFO: inlined 0 calls at 1 call sites in 2 iteration(s). Change in binary size: 4 bytes. +#CHECK: Binary Function "test" after inlining { +#CHECK-NOT: bl indirect +#CHECK: add w0, w1, w0 +#CHECK-NEXT: blr x2 + + .text + .globl indirect + .type indirect,@function +indirect: + add w0, w1, w0 + br x2 + .size indirect, .-indirect + + .globl test + .type test,@function +test: + stp x29, x30, [sp, #-32]! + stp x20, x19, [sp, #16] + mov x29, sp + mov w19, w1 + mov w20, w0 + bl indirect + add w8, w19, w20 + cmp w0, #0 + csinc w0, w8, wzr, eq + ldp x20, x19, [sp, #16] + ldp x29, x30, [sp], #32 + ret + .size test, .-test + + .globl main + .type main,@function +main: + mov w0, wzr + ret + .size main, .-main + + \ No newline at end of file From 3b3590aa59f6ba35c746c01c0692621494b62cab Mon Sep 17 00:00:00 2001 From: Sushant Gokhale Date: Fri, 17 Jan 2025 02:05:05 -0800 Subject: [PATCH 18/45] Revert "Revert "[InstCombine] Transform high latency, dependent FSQRT/FDIV into FMUL"" (#123313) Reverts llvm/llvm-project#123289 --- .../InstCombine/InstCombineMulDivRem.cpp | 176 +++++ .../InstCombine/fsqrtdiv-transform.ll | 631 ++++++++++++++++++ 2 files changed, 807 insertions(+) create mode 100644 llvm/test/Transforms/InstCombine/fsqrtdiv-transform.ll diff --git a/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp b/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp index d0b2ded127ff7..b6acde9bdd110 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp @@ -13,6 +13,7 @@ #include "InstCombineInternal.h" #include "llvm/ADT/APInt.h" +#include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/ValueTracking.h" @@ -657,6 +658,94 @@ Instruction *InstCombinerImpl::foldPowiReassoc(BinaryOperator &I) { return nullptr; } +// If we have the following pattern, +// X = 1.0/sqrt(a) +// R1 = X * X +// R2 = a/sqrt(a) +// then this method collects all the instructions that match R1 and R2. 
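(As a concrete illustration of the pattern being collected - a minimal IR
sketch patterned on the fsqrtdiv-transform.ll test added at the end of this
patch; value names are illustrative:

  ; before
  %sqrt = call reassoc nnan nsz ninf double @llvm.sqrt.f64(double %a)
  %x    = fdiv reassoc arcp ninf double 1.000000e+00, %sqrt   ; X  = 1/sqrt(a)
  %r1   = fmul reassoc double %x, %x                          ; R1 = X * X
  %r2   = fdiv reassoc double %a, %sqrt                       ; R2 = a/sqrt(a)

  ; after - valid because (1/sqrt(a))^2 == 1/a and a/sqrt(a) == sqrt(a)
  ; when a is known positive
  %inv  = fdiv reassoc double 1.000000e+00, %a                ; replaces R1
  %root = call reassoc double @llvm.sqrt.f64(double %a)       ; replaces R2
  %x2   = fmul reassoc double %inv, %root                     ; replaces X

so one sqrt and two fdivs collapse into one fdiv, one sqrt and one fmul.)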
+static bool getFSqrtDivOptPattern(Instruction *Div, + SmallPtrSetImpl &R1, + SmallPtrSetImpl &R2) { + Value *A; + if (match(Div, m_FDiv(m_FPOne(), m_Sqrt(m_Value(A)))) || + match(Div, m_FDiv(m_SpecificFP(-1.0), m_Sqrt(m_Value(A))))) { + for (User *U : Div->users()) { + Instruction *I = cast(U); + if (match(I, m_FMul(m_Specific(Div), m_Specific(Div)))) + R1.insert(I); + } + + CallInst *CI = cast(Div->getOperand(1)); + for (User *U : CI->users()) { + Instruction *I = cast(U); + if (match(I, m_FDiv(m_Specific(A), m_Sqrt(m_Specific(A))))) + R2.insert(I); + } + } + return !R1.empty() && !R2.empty(); +} + +// Check legality for transforming +// x = 1.0/sqrt(a) +// r1 = x * x; +// r2 = a/sqrt(a); +// +// TO +// +// r1 = 1/a +// r2 = sqrt(a) +// x = r1 * r2 +// This transform works only when 'a' is known positive. +static bool isFSqrtDivToFMulLegal(Instruction *X, + SmallPtrSetImpl &R1, + SmallPtrSetImpl &R2) { + // Check if the required pattern for the transformation exists. + if (!getFSqrtDivOptPattern(X, R1, R2)) + return false; + + BasicBlock *BBx = X->getParent(); + BasicBlock *BBr1 = (*R1.begin())->getParent(); + BasicBlock *BBr2 = (*R2.begin())->getParent(); + + CallInst *FSqrt = cast(X->getOperand(1)); + if (!FSqrt->hasAllowReassoc() || !FSqrt->hasNoNaNs() || + !FSqrt->hasNoSignedZeros() || !FSqrt->hasNoInfs()) + return false; + + // We change x = 1/sqrt(a) to x = sqrt(a) * 1/a . This change isn't allowed + // by recip fp as it is strictly meant to transform ops of type a/b to + // a * 1/b. So, this can be considered as algebraic rewrite and reassoc flag + // has been used(rather abused)in the past for algebraic rewrites. + if (!X->hasAllowReassoc() || !X->hasAllowReciprocal() || !X->hasNoInfs()) + return false; + + // Check the constraints on X, R1 and R2 combined. + // fdiv instruction and one of the multiplications must reside in the same + // block. If not, the optimized code may execute more ops than before and + // this may hamper the performance. + if (BBx != BBr1 && BBx != BBr2) + return false; + + // Check the constraints on instructions in R1. + if (any_of(R1, [BBr1](Instruction *I) { + // When you have multiple instructions residing in R1 and R2 + // respectively, it's difficult to generate combinations of (R1,R2) and + // then check if we have the required pattern. So, for now, just be + // conservative. + return (I->getParent() != BBr1 || !I->hasAllowReassoc()); + })) + return false; + + // Check the constraints on instructions in R2. + return all_of(R2, [BBr2](Instruction *I) { + // When you have multiple instructions residing in R1 and R2 + // respectively, it's difficult to generate combination of (R1,R2) and + // then check if we have the required pattern. So, for now, just be + // conservative. 
+ return (I->getParent() == BBr2 && I->hasAllowReassoc()); + }); +} + Instruction *InstCombinerImpl::foldFMulReassoc(BinaryOperator &I) { Value *Op0 = I.getOperand(0); Value *Op1 = I.getOperand(1); @@ -1913,6 +2002,75 @@ static Instruction *foldFDivSqrtDivisor(BinaryOperator &I, return BinaryOperator::CreateFMulFMF(Op0, NewSqrt, &I); } +// Change +// X = 1/sqrt(a) +// R1 = X * X +// R2 = a * X +// +// TO +// +// FDiv = 1/a +// FSqrt = sqrt(a) +// FMul = FDiv * FSqrt +// Replace Uses Of R1 With FDiv +// Replace Uses Of R2 With FSqrt +// Replace Uses Of X With FMul +static Instruction * +convertFSqrtDivIntoFMul(CallInst *CI, Instruction *X, + const SmallPtrSetImpl &R1, + const SmallPtrSetImpl &R2, + InstCombiner::BuilderTy &B, InstCombinerImpl *IC) { + + B.SetInsertPoint(X); + + // Have an instruction that is representative of all of instructions in R1 and + // get the most common fpmath metadata and fast-math flags on it. + Value *SqrtOp = CI->getArgOperand(0); + auto *FDiv = cast( + B.CreateFDiv(ConstantFP::get(X->getType(), 1.0), SqrtOp)); + auto *R1FPMathMDNode = (*R1.begin())->getMetadata(LLVMContext::MD_fpmath); + FastMathFlags R1FMF = (*R1.begin())->getFastMathFlags(); // Common FMF + for (Instruction *I : R1) { + R1FPMathMDNode = MDNode::getMostGenericFPMath( + R1FPMathMDNode, I->getMetadata(LLVMContext::MD_fpmath)); + R1FMF &= I->getFastMathFlags(); + IC->replaceInstUsesWith(*I, FDiv); + IC->eraseInstFromFunction(*I); + } + FDiv->setMetadata(LLVMContext::MD_fpmath, R1FPMathMDNode); + FDiv->copyFastMathFlags(R1FMF); + + // Have a single sqrt call instruction that is representative of all of + // instructions in R2 and get the most common fpmath metadata and fast-math + // flags on it. + auto *FSqrt = cast(CI->clone()); + FSqrt->insertBefore(CI); + auto *R2FPMathMDNode = (*R2.begin())->getMetadata(LLVMContext::MD_fpmath); + FastMathFlags R2FMF = (*R2.begin())->getFastMathFlags(); // Common FMF + for (Instruction *I : R2) { + R2FPMathMDNode = MDNode::getMostGenericFPMath( + R2FPMathMDNode, I->getMetadata(LLVMContext::MD_fpmath)); + R2FMF &= I->getFastMathFlags(); + IC->replaceInstUsesWith(*I, FSqrt); + IC->eraseInstFromFunction(*I); + } + FSqrt->setMetadata(LLVMContext::MD_fpmath, R2FPMathMDNode); + FSqrt->copyFastMathFlags(R2FMF); + + Instruction *FMul; + // If X = -1/sqrt(a) initially,then FMul = -(FDiv * FSqrt) + if (match(X, m_FDiv(m_SpecificFP(-1.0), m_Specific(CI)))) { + Value *Mul = B.CreateFMul(FDiv, FSqrt); + FMul = cast(B.CreateFNeg(Mul)); + } else + FMul = cast(B.CreateFMul(FDiv, FSqrt)); + FMul->copyMetadata(*X); + FMul->copyFastMathFlags(FastMathFlags::intersectRewrite(R1FMF, R2FMF) | + FastMathFlags::unionValue(R1FMF, R2FMF)); + IC->replaceInstUsesWith(*X, FMul); + return IC->eraseInstFromFunction(*X); +} + Instruction *InstCombinerImpl::visitFDiv(BinaryOperator &I) { Module *M = I.getModule(); @@ -1937,6 +2095,24 @@ Instruction *InstCombinerImpl::visitFDiv(BinaryOperator &I) { return R; Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1); + + // Convert + // x = 1.0/sqrt(a) + // r1 = x * x; + // r2 = a/sqrt(a); + // + // TO + // + // r1 = 1/a + // r2 = sqrt(a) + // x = r1 * r2 + SmallPtrSet R1, R2; + if (isFSqrtDivToFMulLegal(&I, R1, R2)) { + CallInst *CI = cast(I.getOperand(1)); + if (Instruction *D = convertFSqrtDivIntoFMul(CI, &I, R1, R2, Builder, this)) + return D; + } + if (isa(Op0)) if (SelectInst *SI = dyn_cast(Op1)) if (Instruction *R = FoldOpIntoSelect(I, SI)) diff --git a/llvm/test/Transforms/InstCombine/fsqrtdiv-transform.ll 
b/llvm/test/Transforms/InstCombine/fsqrtdiv-transform.ll new file mode 100644 index 0000000000000..6296954333e8a --- /dev/null +++ b/llvm/test/Transforms/InstCombine/fsqrtdiv-transform.ll @@ -0,0 +1,631 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; RUN: opt -S -passes='instcombine' < %s | FileCheck %s + +@x = global double 0.000000e+00 +@r1 = global double 0.000000e+00 +@r2 = global double 0.000000e+00 +@r3 = global double 0.000000e+00 +@v = global [2 x double] zeroinitializer +@v1 = global [2 x double] zeroinitializer +@v2 = global [2 x double] zeroinitializer + +; div/mul/div1 in the same block. +define void @bb_constraint_case1(double %a) { +; CHECK-LABEL: define void @bb_constraint_case1( +; CHECK-SAME: double [[A:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[SQRT1:%.*]] = call reassoc double @llvm.sqrt.f64(double [[A]]) +; CHECK-NEXT: [[TMP0:%.*]] = fdiv reassoc double 1.000000e+00, [[A]] +; CHECK-NEXT: [[DIV:%.*]] = fmul reassoc double [[TMP0]], [[SQRT1]] +; CHECK-NEXT: store double [[DIV]], ptr @x, align 8 +; CHECK-NEXT: store double [[TMP0]], ptr @r1, align 8 +; CHECK-NEXT: store double [[SQRT1]], ptr @r2, align 8 +; CHECK-NEXT: ret void +; +entry: + %sqrt = call reassoc nnan nsz ninf double @llvm.sqrt.f64(double %a) + %div = fdiv reassoc arcp ninf double 1.000000e+00, %sqrt + store double %div, ptr @x + %mul = fmul reassoc double %div, %div + store double %mul, ptr @r1 + %div1 = fdiv reassoc double %a, %sqrt + store double %div1, ptr @r2 + ret void +} + +; div/mul in one block and div1 in other block with conditional guard. +define void @bb_constraint_case2(double %a, i32 %d) { +; CHECK-LABEL: define void @bb_constraint_case2( +; CHECK-SAME: double [[A:%.*]], i32 [[D:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[SQRT1:%.*]] = call reassoc double @llvm.sqrt.f64(double [[A]]) +; CHECK-NEXT: [[TMP0:%.*]] = fdiv reassoc double 1.000000e+00, [[A]] +; CHECK-NEXT: [[DIV:%.*]] = fmul reassoc double [[TMP0]], [[SQRT1]] +; CHECK-NEXT: store double [[DIV]], ptr @x, align 8 +; CHECK-NEXT: store double [[TMP0]], ptr @r1, align 8 +; CHECK-NEXT: [[D_NOT:%.*]] = icmp eq i32 [[D]], 0 +; CHECK-NEXT: br i1 [[D_NOT]], label [[IF_END:%.*]], label [[IF_THEN:%.*]] +; CHECK: if.then: +; CHECK-NEXT: store double [[SQRT1]], ptr @r2, align 8 +; CHECK-NEXT: br label [[IF_END]] +; CHECK: if.end: +; CHECK-NEXT: ret void +; +entry: + %sqrt = call reassoc nnan nsz ninf double @llvm.sqrt.f64(double %a) + %div = fdiv reassoc arcp ninf double 1.000000e+00, %sqrt + store double %div, ptr @x + %mul = fmul reassoc double %div, %div + store double %mul, ptr @r1 + %d.not = icmp eq i32 %d, 0 + br i1 %d.not, label %if.end, label %if.then + +if.then: ; preds = %entry + %div1 = fdiv reassoc double %a, %sqrt + store double %div1, ptr @r2 + br label %if.end + +if.end: ; preds = %if.then, %entry + ret void +} + +; div in one block. mul/div1 in other block and conditionally guarded. Don't optimize. 
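+; (The legality check requires the 1/sqrt fdiv to share a basic block with at
+; least one of its matched users; here both the fmul and the a/sqrt(a) fdiv sit
+; behind the branch, so the rewrite is skipped.)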
+define void @bb_constraint_case3(double %a, i32 %d) { +; CHECK-LABEL: define void @bb_constraint_case3( +; CHECK-SAME: double [[A:%.*]], i32 [[D:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[SQRT:%.*]] = call reassoc nnan ninf nsz double @llvm.sqrt.f64(double [[A]]) +; CHECK-NEXT: [[DIV:%.*]] = fdiv reassoc ninf arcp double 1.000000e+00, [[SQRT]] +; CHECK-NEXT: store double [[DIV]], ptr @x, align 8 +; CHECK-NEXT: [[D_NOT:%.*]] = icmp eq i32 [[D]], 0 +; CHECK-NEXT: br i1 [[D_NOT]], label [[IF_END:%.*]], label [[IF_THEN:%.*]] +; CHECK: if.then: +; CHECK-NEXT: [[MUL:%.*]] = fmul reassoc double [[DIV]], [[DIV]] +; CHECK-NEXT: store double [[MUL]], ptr @r1, align 8 +; CHECK-NEXT: [[DIV1:%.*]] = fdiv reassoc double [[A]], [[SQRT]] +; CHECK-NEXT: store double [[DIV1]], ptr @r2, align 8 +; CHECK-NEXT: br label [[IF_END]] +; CHECK: if.end: +; CHECK-NEXT: ret void +; +entry: + %sqrt = call reassoc nnan nsz ninf double @llvm.sqrt.f64(double %a) + %div = fdiv reassoc arcp ninf double 1.000000e+00, %sqrt + store double %div, ptr @x + %d.not = icmp eq i32 %d, 0 + br i1 %d.not, label %if.end, label %if.then + +if.then: ; preds = %entry + %mul = fmul reassoc double %div, %div + store double %mul, ptr @r1 + %div1 = fdiv reassoc double %a, %sqrt + store double %div1, ptr @r2 + br label %if.end + +if.end: ; preds = %if.then, %entry + ret void +} + +; div in one block. mul/div1 each in different block and conditionally guarded. Don't optimize. +define void @bb_constraint_case4(double %a, i32 %c, i32 %d) { +; CHECK-LABEL: define void @bb_constraint_case4( +; CHECK-SAME: double [[A:%.*]], i32 [[C:%.*]], i32 [[D:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[SQRT:%.*]] = call reassoc nnan ninf nsz double @llvm.sqrt.f64(double [[A]]) +; CHECK-NEXT: [[DIV:%.*]] = fdiv reassoc ninf arcp double 1.000000e+00, [[SQRT]] +; CHECK-NEXT: store double [[DIV]], ptr @x, align 8 +; CHECK-NEXT: [[C_NOT:%.*]] = icmp eq i32 [[C]], 0 +; CHECK-NEXT: br i1 [[C_NOT]], label [[IF_END:%.*]], label [[IF_THEN:%.*]] +; CHECK: if.then: +; CHECK-NEXT: [[MUL:%.*]] = fmul reassoc double [[DIV]], [[DIV]] +; CHECK-NEXT: store double [[MUL]], ptr @r1, align 8 +; CHECK-NEXT: br label [[IF_END]] +; CHECK: if.end: +; CHECK-NEXT: [[D_NOT:%.*]] = icmp eq i32 [[D]], 0 +; CHECK-NEXT: br i1 [[D_NOT]], label [[IF_END1:%.*]], label [[IF_THEN1:%.*]] +; CHECK: if.then1: +; CHECK-NEXT: [[DIV1:%.*]] = fdiv reassoc double [[A]], [[SQRT]] +; CHECK-NEXT: store double [[DIV1]], ptr @r2, align 8 +; CHECK-NEXT: br label [[IF_END1]] +; CHECK: if.end1: +; CHECK-NEXT: ret void +; +entry: + %sqrt = call reassoc nnan nsz ninf double @llvm.sqrt.f64(double %a) + %div = fdiv reassoc arcp ninf double 1.000000e+00, %sqrt + store double %div, ptr @x + %c.not = icmp eq i32 %c, 0 + br i1 %c.not, label %if.end, label %if.then + +if.then: ; preds = %entry + %mul = fmul reassoc double %div, %div + store double %mul, ptr @r1 + br label %if.end + +if.end: ; preds = %if.then, %entry + %d.not = icmp eq i32 %d, 0 + br i1 %d.not, label %if.end1, label %if.then1 + +if.then1: ; preds = %if.end + %div1 = fdiv reassoc double %a, %sqrt + store double %div1, ptr @r2 + br label %if.end1 + +if.end1: ; preds = %if.then1, %if.end + ret void +} + +; sqrt value comes from different blocks. Don't optimize. 
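+; (The pattern matcher requires the divisor of the 1/sqrt fdiv to be the sqrt
+; call itself; a phi merging two sqrt results does not match.)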
+define void @bb_constraint_case5(double %a, i32 %c) { +; CHECK-LABEL: define void @bb_constraint_case5( +; CHECK-SAME: double [[A:%.*]], i32 [[C:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[C_NOT:%.*]] = icmp eq i32 [[C]], 0 +; CHECK-NEXT: br i1 [[C_NOT]], label [[IF_ELSE:%.*]], label [[IF_THEN:%.*]] +; CHECK: if.then: +; CHECK-NEXT: [[TMP0:%.*]] = call reassoc nnan ninf nsz double @llvm.sqrt.f64(double [[A]]) +; CHECK-NEXT: br label [[IF_END:%.*]] +; CHECK: if.else: +; CHECK-NEXT: [[ADD:%.*]] = fadd double [[A]], 1.000000e+01 +; CHECK-NEXT: [[TMP1:%.*]] = call reassoc nnan ninf nsz double @llvm.sqrt.f64(double [[ADD]]) +; CHECK-NEXT: br label [[IF_END]] +; CHECK: if.end: +; CHECK-NEXT: [[SQRT:%.*]] = phi double [ [[TMP0]], [[IF_THEN]] ], [ [[TMP1]], [[IF_ELSE]] ] +; CHECK-NEXT: [[DIV:%.*]] = fdiv reassoc ninf arcp double 1.000000e+00, [[SQRT]] +; CHECK-NEXT: [[MUL:%.*]] = fmul reassoc double [[DIV]], [[DIV]] +; CHECK-NEXT: store double [[MUL]], ptr @r1, align 8 +; CHECK-NEXT: [[DIV1:%.*]] = fdiv reassoc double [[A]], [[SQRT]] +; CHECK-NEXT: store double [[DIV1]], ptr @r2, align 8 +; CHECK-NEXT: ret void +; +entry: + %c.not = icmp eq i32 %c, 0 + br i1 %c.not, label %if.else, label %if.then + +if.then: ; preds = %entry + %0 = call reassoc nnan nsz ninf double @llvm.sqrt.f64(double %a) + br label %if.end + +if.else: ; preds = %entry + %add = fadd double %a, 1.000000e+01 + %1 = call reassoc nnan nsz ninf double @llvm.sqrt.f64(double %add) + br label %if.end + +if.end: ; preds = %if.else, %if.then + %sqrt = phi double[ %0, %if.then], [ %1, %if.else] + %div = fdiv reassoc arcp ninf double 1.000000e+00, %sqrt + %mul = fmul reassoc double %div, %div + store double %mul, ptr @r1 + %div1 = fdiv reassoc double %a, %sqrt + store double %div1, ptr @r2 + ret void +} + +; div in one block and conditionally guarded. mul/div1 in other block. Don't optimize. 
+define void @bb_constraint_case6(double %a, i32 %d) { +; CHECK-LABEL: define void @bb_constraint_case6( +; CHECK-SAME: double [[A:%.*]], i32 [[D:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[SQRT:%.*]] = call reassoc nnan ninf nsz double @llvm.sqrt.f64(double [[A]]) +; CHECK-NEXT: [[D_NOT:%.*]] = icmp eq i32 [[D]], 0 +; CHECK-NEXT: br i1 [[D_NOT]], label [[IF_ELSE:%.*]], label [[IF_THEN:%.*]] +; CHECK: if.else: +; CHECK-NEXT: [[TMP0:%.*]] = load double, ptr @x, align 8 +; CHECK-NEXT: br label [[IF_END:%.*]] +; CHECK: if.then: +; CHECK-NEXT: [[TMP1:%.*]] = fdiv reassoc ninf arcp double 1.000000e+00, [[SQRT]] +; CHECK-NEXT: store double [[TMP1]], ptr @x, align 8 +; CHECK-NEXT: br label [[IF_END]] +; CHECK: if.end: +; CHECK-NEXT: [[DIV:%.*]] = phi double [ [[TMP0]], [[IF_ELSE]] ], [ [[TMP1]], [[IF_THEN]] ] +; CHECK-NEXT: [[MUL:%.*]] = fmul reassoc double [[DIV]], [[DIV]] +; CHECK-NEXT: store double [[MUL]], ptr @r1, align 8 +; CHECK-NEXT: [[DIV1:%.*]] = fdiv reassoc double [[A]], [[SQRT]] +; CHECK-NEXT: store double [[DIV1]], ptr @r2, align 8 +; CHECK-NEXT: ret void +; +entry: + %sqrt = call reassoc nnan nsz ninf double @llvm.sqrt.f64(double %a) + %d.not = icmp eq i32 %d, 0 + br i1 %d.not, label %if.else, label %if.then + +if.else: ; preds = %entry + %1 = load double, ptr @x + br label %if.end + +if.then: ; preds = %entry + %2 = fdiv reassoc arcp ninf double 1.000000e+00, %sqrt + store double %2, ptr @x + br label %if.end + +if.end: ; preds = %if.else, %if.then + %div = phi double [ %1, %if.else ], [ %2, %if.then ] + %mul = fmul reassoc double %div, %div + store double %mul, ptr @r1 + %div1 = fdiv reassoc double %a, %sqrt + store double %div1, ptr @r2 + ret void +} + +; value for mul comes from different blocks. Don't optimize. +define void @bb_constraint_case7(double %a, i32 %c, i32 %d) { +; CHECK-LABEL: define void @bb_constraint_case7( +; CHECK-SAME: double [[A:%.*]], i32 [[C:%.*]], i32 [[D:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[SQRT:%.*]] = call reassoc nnan ninf nsz double @llvm.sqrt.f64(double [[A]]) +; CHECK-NEXT: [[DIV:%.*]] = fdiv reassoc ninf arcp double 1.000000e+00, [[SQRT]] +; CHECK-NEXT: store double [[DIV]], ptr @x, align 8 +; CHECK-NEXT: [[C_NOT:%.*]] = icmp eq i32 [[C]], 0 +; CHECK-NEXT: br i1 [[C_NOT]], label [[IF_ELSE:%.*]], label [[IF_THEN:%.*]] +; CHECK: if.then: +; CHECK-NEXT: [[TMP0:%.*]] = fdiv double 3.000000e+00, [[A]] +; CHECK-NEXT: br label [[IF_END:%.*]] +; CHECK: if.else: +; CHECK-NEXT: [[D_NOT:%.*]] = icmp eq i32 [[D]], 0 +; CHECK-NEXT: br i1 [[D_NOT]], label [[IF_ELSE1:%.*]], label [[IF_THEN1:%.*]] +; CHECK: if.then1: +; CHECK-NEXT: [[TMP1:%.*]] = fdiv double 2.000000e+00, [[A]] +; CHECK-NEXT: br label [[IF_END]] +; CHECK: if.else1: +; CHECK-NEXT: [[TMP2:%.*]] = fmul reassoc double [[DIV]], [[DIV]] +; CHECK-NEXT: br label [[IF_END]] +; CHECK: if.end: +; CHECK-NEXT: [[MUL:%.*]] = phi double [ [[TMP1]], [[IF_THEN1]] ], [ [[TMP2]], [[IF_ELSE1]] ], [ [[TMP0]], [[IF_THEN]] ] +; CHECK-NEXT: store double [[MUL]], ptr @r1, align 8 +; CHECK-NEXT: [[DIV1:%.*]] = fdiv reassoc double [[A]], [[SQRT]] +; CHECK-NEXT: store double [[DIV1]], ptr @r2, align 8 +; CHECK-NEXT: ret void +; +entry: + %sqrt = call reassoc nnan nsz ninf double @llvm.sqrt.f64(double %a) + %div = fdiv reassoc arcp ninf double 1.000000e+00, %sqrt + store double %div, ptr @x + %c.not = icmp eq i32 %c, 0 + br i1 %c.not, label %if.else, label %if.then + +if.then: ; preds = %entry + %1 = fdiv double 3.000000e+00, %a + br label %if.end + +if.else: ; preds = %entry + %d.not = icmp eq i32 %d, 0 + br 
i1 %d.not, label %if.else1, label %if.then1 + +if.then1: ; preds = %if.else + %2 = fdiv double 2.000000e+00, %a + br label %if.end + +if.else1: ; preds = %if.else + %3 = fmul reassoc double %div, %div + br label %if.end + +if.end: ; preds = %if.then1, %if.else1, %if.then + %mul = phi double [ %2, %if.then1 ], [ %3, %if.else1 ], [ %1, %if.then ] + store double %mul, ptr @r1 + %div1 = fdiv reassoc double %a, %sqrt + store double %div1, ptr @r2 + ret void +} + +; value of mul comes from two different blocks(as shown by select ins). +define void @bb_constraint_case8(double %a, i32 %c) { +; CHECK-LABEL: define void @bb_constraint_case8( +; CHECK-SAME: double [[A:%.*]], i32 [[C:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[SQRT1:%.*]] = call reassoc double @llvm.sqrt.f64(double [[A]]) +; CHECK-NEXT: [[TMP0:%.*]] = fdiv reassoc double 1.000000e+00, [[A]] +; CHECK-NEXT: [[DIV:%.*]] = fmul reassoc double [[TMP0]], [[SQRT1]] +; CHECK-NEXT: store double [[DIV]], ptr @x, align 8 +; CHECK-NEXT: [[C_NOT:%.*]] = icmp eq i32 [[C]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = fmul double [[A]], [[A]] +; CHECK-NEXT: [[MUL:%.*]] = select i1 [[C_NOT]], double [[TMP1]], double [[TMP0]] +; CHECK-NEXT: store double [[MUL]], ptr @r1, align 8 +; CHECK-NEXT: store double [[SQRT1]], ptr @r2, align 8 +; CHECK-NEXT: ret void +; +entry: + %sqrt = call reassoc nnan nsz ninf double @llvm.sqrt.f64(double %a) + %div = fdiv reassoc arcp ninf double 1.000000e+00, %sqrt + store double %div, ptr @x + %c.not = icmp eq i32 %c, 0 + %1 = fmul double %a, %a + %2 = fmul reassoc double %div, %div + %mul = select i1 %c.not, double %1, double %2 + store double %mul, ptr @r1 + %div1 = fdiv reassoc double %a, %sqrt + store double %div1, ptr @r2 + ret void +} + +; multiple instances of multiply ops to optimize. Optimize all. +define void @mutiple_multiply_instances(double %a, i32 %c) { +; CHECK-LABEL: define void @mutiple_multiply_instances( +; CHECK-SAME: double [[A:%.*]], i32 [[C:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[SQRT1:%.*]] = call reassoc double @llvm.sqrt.f64(double [[A]]) +; CHECK-NEXT: [[TMP1:%.*]] = fdiv reassoc double 1.000000e+00, [[A]] +; CHECK-NEXT: [[DIV:%.*]] = fmul reassoc double [[TMP1]], [[SQRT1]] +; CHECK-NEXT: store double [[DIV]], ptr @x, align 8 +; CHECK-NEXT: [[C_NOT:%.*]] = icmp eq i32 [[C]], 0 +; CHECK-NEXT: [[TMP2:%.*]] = fmul double [[A]], [[A]] +; CHECK-NEXT: [[TMP3:%.*]] = fmul double [[A]], [[A]] +; CHECK-NEXT: [[MUL1:%.*]] = select i1 [[C_NOT]], double [[TMP2]], double [[TMP1]] +; CHECK-NEXT: [[MUL2:%.*]] = select i1 [[C_NOT]], double [[TMP1]], double [[TMP3]] +; CHECK-NEXT: store double [[MUL1]], ptr @r1, align 8 +; CHECK-NEXT: store double [[MUL2]], ptr @r3, align 8 +; CHECK-NEXT: store double [[SQRT1]], ptr @r2, align 8 +; CHECK-NEXT: ret void +; +entry: + %sqrt = call reassoc nnan nsz ninf double @llvm.sqrt.f64(double %a) + %div = fdiv reassoc arcp ninf double 1.000000e+00, %sqrt + store double %div, ptr @x + %c.not = icmp eq i32 %c, 0 + %1 = fmul double %a, %a + %2 = fmul double %a, %a + %3 = fmul reassoc double %div, %div + %4 = fmul reassoc double %div, %div + %mul1 = select i1 %c.not, double %1, double %3 + %mul2 = select i1 %c.not, double %4, double %2 + store double %mul1, ptr @r1 + store double %mul2, ptr @r3 + %div1 = fdiv reassoc double %a, %sqrt + store double %div1, ptr @r2 + ret void +} + +; missing flags for optimization. 
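+; (The 1/sqrt fdiv must carry reassoc, arcp and ninf; here arcp is missing, so
+; the transform is not applied.)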
+define void @missing_arcp_flag_on_div(double %a) { +; CHECK-LABEL: define void @missing_arcp_flag_on_div( +; CHECK-SAME: double [[A:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[SQRT:%.*]] = call reassoc nnan ninf nsz double @llvm.sqrt.f64(double [[A]]) +; CHECK-NEXT: [[DIV:%.*]] = fdiv reassoc ninf double 1.000000e+00, [[SQRT]] +; CHECK-NEXT: store double [[DIV]], ptr @x, align 8 +; CHECK-NEXT: [[MUL:%.*]] = fmul reassoc double [[DIV]], [[DIV]] +; CHECK-NEXT: store double [[MUL]], ptr @r1, align 8 +; CHECK-NEXT: [[DIV1:%.*]] = fdiv reassoc double [[A]], [[SQRT]] +; CHECK-NEXT: store double [[DIV1]], ptr @r2, align 8 +; CHECK-NEXT: ret void +; +entry: + %sqrt = call reassoc nnan nsz ninf double @llvm.sqrt.f64(double %a) + %div = fdiv reassoc ninf double 1.000000e+00, %sqrt + store double %div, ptr @x + %mul = fmul reassoc double %div, %div + store double %mul, ptr @r1 + %div1 = fdiv reassoc double %a, %sqrt + store double %div1, ptr @r2 + ret void +} + +; missing flags for optimization. +define void @missing_reassoc_flag_on_mul(double %a) { +; CHECK-LABEL: define void @missing_reassoc_flag_on_mul( +; CHECK-SAME: double [[A:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[SQRT:%.*]] = call reassoc nnan ninf nsz double @llvm.sqrt.f64(double [[A]]) +; CHECK-NEXT: [[DIV:%.*]] = fdiv reassoc ninf arcp double 1.000000e+00, [[SQRT]] +; CHECK-NEXT: store double [[DIV]], ptr @x, align 8 +; CHECK-NEXT: [[MUL:%.*]] = fmul double [[DIV]], [[DIV]] +; CHECK-NEXT: store double [[MUL]], ptr @r1, align 8 +; CHECK-NEXT: [[DIV1:%.*]] = fdiv reassoc double [[A]], [[SQRT]] +; CHECK-NEXT: store double [[DIV1]], ptr @r2, align 8 +; CHECK-NEXT: ret void +; +entry: + %sqrt = call reassoc nnan nsz ninf double @llvm.sqrt.f64(double %a) + %div = fdiv reassoc arcp ninf double 1.000000e+00, %sqrt + store double %div, ptr @x + %mul = fmul double %div, %div + store double %mul, ptr @r1 + %div1 = fdiv reassoc double %a, %sqrt + store double %div1, ptr @r2 + ret void +} + +; missing flags for optimization. 
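+; (Every matched a/sqrt(a) fdiv must itself carry reassoc; here it does not, so
+; the transform is not applied.)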
+define void @missing_reassoc_flag_on_div1(double %a) { +; CHECK-LABEL: define void @missing_reassoc_flag_on_div1( +; CHECK-SAME: double [[A:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[SQRT:%.*]] = call reassoc nnan ninf nsz double @llvm.sqrt.f64(double [[A]]) +; CHECK-NEXT: [[DIV:%.*]] = fdiv reassoc ninf arcp double 1.000000e+00, [[SQRT]] +; CHECK-NEXT: store double [[DIV]], ptr @x, align 8 +; CHECK-NEXT: [[MUL:%.*]] = fmul reassoc double [[DIV]], [[DIV]] +; CHECK-NEXT: store double [[MUL]], ptr @r1, align 8 +; CHECK-NEXT: [[DIV1:%.*]] = fdiv double [[A]], [[SQRT]] +; CHECK-NEXT: store double [[DIV1]], ptr @r2, align 8 +; CHECK-NEXT: ret void +; +entry: + %sqrt = call reassoc nnan nsz ninf double @llvm.sqrt.f64(double %a) + %div = fdiv reassoc arcp ninf double 1.000000e+00, %sqrt + store double %div, ptr @x + %mul = fmul reassoc double %div, %div + store double %mul, ptr @r1 + %div1 = fdiv double %a, %sqrt + store double %div1, ptr @r2 + ret void +} + +; div = -1/sqrt(a) +define void @negative_fdiv_val(double %a) { +; CHECK-LABEL: define void @negative_fdiv_val( +; CHECK-SAME: double [[A:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[SQRT1:%.*]] = call reassoc double @llvm.sqrt.f64(double [[A]]) +; CHECK-NEXT: [[TMP0:%.*]] = fdiv reassoc double 1.000000e+00, [[A]] +; CHECK-NEXT: [[TMP1:%.*]] = fneg reassoc double [[SQRT1]] +; CHECK-NEXT: [[DIV:%.*]] = fmul reassoc double [[TMP0]], [[TMP1]] +; CHECK-NEXT: store double [[DIV]], ptr @x, align 8 +; CHECK-NEXT: store double [[TMP0]], ptr @r1, align 8 +; CHECK-NEXT: store double [[SQRT1]], ptr @r2, align 8 +; CHECK-NEXT: ret void +; +entry: + %sqrt = call reassoc nnan nsz ninf double @llvm.sqrt.f64(double %a) + %div = fdiv reassoc arcp ninf double -1.000000e+00, %sqrt + store double %div, ptr @x + %mul = fmul reassoc double %div, %div + store double %mul, ptr @r1 + %div1 = fdiv reassoc double %a, %sqrt + store double %div1, ptr @r2 + ret void +} + +define void @fpmath_metadata_on_div1(double %a) { +; CHECK-LABEL: define void @fpmath_metadata_on_div1( +; CHECK-SAME: double [[A:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[SQRT1:%.*]] = call reassoc double @llvm.sqrt.f64(double [[A]]), !fpmath [[META0:![0-9]+]] +; CHECK-NEXT: [[TMP0:%.*]] = fdiv reassoc double 1.000000e+00, [[A]] +; CHECK-NEXT: [[DIV:%.*]] = fmul reassoc double [[TMP0]], [[SQRT1]] +; CHECK-NEXT: store double [[DIV]], ptr @x, align 8 +; CHECK-NEXT: store double [[TMP0]], ptr @r1, align 8 +; CHECK-NEXT: store double [[SQRT1]], ptr @r2, align 8 +; CHECK-NEXT: ret void +; +entry: + %sqrt = call reassoc nnan nsz ninf double @llvm.sqrt.f64(double %a) + %div = fdiv reassoc arcp ninf double 1.000000e+00, %sqrt + store double %div, ptr @x + %mul = fmul reassoc double %div, %div + store double %mul, ptr @r1 + %div1 = fdiv reassoc double %a, %sqrt, !fpmath !3 + store double %div1, ptr @r2 + ret void +} + +define void @fpmath_metadata_on_mul(double %a) { +; CHECK-LABEL: define void @fpmath_metadata_on_mul( +; CHECK-SAME: double [[A:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[SQRT1:%.*]] = call reassoc double @llvm.sqrt.f64(double [[A]]) +; CHECK-NEXT: [[TMP0:%.*]] = fdiv reassoc double 1.000000e+00, [[A]], !fpmath [[META1:![0-9]+]] +; CHECK-NEXT: [[DIV:%.*]] = fmul reassoc double [[TMP0]], [[SQRT1]] +; CHECK-NEXT: store double [[DIV]], ptr @x, align 8 +; CHECK-NEXT: store double [[TMP0]], ptr @r1, align 8 +; CHECK-NEXT: store double [[SQRT1]], ptr @r2, align 8 +; CHECK-NEXT: ret void +; +entry: + %sqrt = call reassoc nnan nsz ninf double @llvm.sqrt.f64(double %a) + %div = fdiv 
reassoc arcp ninf double 1.000000e+00, %sqrt + store double %div, ptr @x + %mul = fmul reassoc double %div, %div, !fpmath !2 + store double %mul, ptr @r1 + %div1 = fdiv reassoc double %a, %sqrt + store double %div1, ptr @r2 + ret void +} + +; FIXME: DIV in the result should get the fpmath metadata from %div. +define void @fpmath_metadata_on_div(double %a) { +; CHECK-LABEL: define void @fpmath_metadata_on_div( +; CHECK-SAME: double [[A:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[SQRT1:%.*]] = call reassoc double @llvm.sqrt.f64(double [[A]]) +; CHECK-NEXT: [[TMP0:%.*]] = fdiv reassoc double 1.000000e+00, [[A]] +; CHECK-NEXT: [[DIV:%.*]] = fmul reassoc double [[TMP0]], [[SQRT1]], !fpmath [[META2:![0-9]+]] +; CHECK-NEXT: store double [[DIV]], ptr @x, align 8 +; CHECK-NEXT: store double [[TMP0]], ptr @r1, align 8 +; CHECK-NEXT: store double [[SQRT1]], ptr @r2, align 8 +; CHECK-NEXT: ret void +; +entry: + %sqrt = call reassoc nnan nsz ninf double @llvm.sqrt.f64(double %a) + %div = fdiv reassoc arcp ninf double 1.000000e+00, %sqrt, !fpmath !1 + store double %div, ptr @x + %mul = fmul reassoc double %div, %div + store double %mul, ptr @r1 + %div1 = fdiv reassoc double %a, %sqrt + store double %div1, ptr @r2 + ret void +} + +define void @fpmath_metadata_on_all(double %a) { +; CHECK-LABEL: define void @fpmath_metadata_on_all( +; CHECK-SAME: double [[A:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[SQRT1:%.*]] = call reassoc double @llvm.sqrt.f64(double [[A]]), !fpmath [[META0]] +; CHECK-NEXT: [[TMP0:%.*]] = fdiv reassoc double 1.000000e+00, [[A]], !fpmath [[META1]] +; CHECK-NEXT: [[DIV:%.*]] = fmul reassoc double [[TMP0]], [[SQRT1]], !fpmath [[META2]] +; CHECK-NEXT: store double [[DIV]], ptr @x, align 8 +; CHECK-NEXT: store double [[TMP0]], ptr @r1, align 8 +; CHECK-NEXT: store double [[SQRT1]], ptr @r2, align 8 +; CHECK-NEXT: ret void +; +entry: + %sqrt = call reassoc nnan nsz ninf double @llvm.sqrt.f64(double %a), !fpmath !0 + %div = fdiv reassoc arcp ninf double 1.000000e+00, %sqrt, !fpmath !1 + store double %div, ptr @x + %mul = fmul reassoc double %div, %div, !fpmath !2 + store double %mul, ptr @r1 + %div1 = fdiv reassoc double %a, %sqrt, !fpmath !3 + store double %div1, ptr @r2 + ret void +} + +define void @vector_input(<2 x double> %a) { +; CHECK-LABEL: define void @vector_input( +; CHECK-SAME: <2 x double> [[A:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[SQRT1:%.*]] = call reassoc <2 x double> @llvm.sqrt.v2f64(<2 x double> [[A]]) +; CHECK-NEXT: [[TMP0:%.*]] = fdiv reassoc <2 x double> splat (double 1.000000e+00), [[A]] +; CHECK-NEXT: [[DIV:%.*]] = fmul reassoc <2 x double> [[TMP0]], [[SQRT1]] +; CHECK-NEXT: store <2 x double> [[DIV]], ptr @v, align 16 +; CHECK-NEXT: store <2 x double> [[TMP0]], ptr @v1, align 16 +; CHECK-NEXT: store <2 x double> [[SQRT1]], ptr @v2, align 16 +; CHECK-NEXT: ret void +; +entry: + %sqrt = call reassoc nnan nsz ninf <2 x double> @llvm.sqrt.v2f64(<2 x double> %a) + %div = fdiv reassoc arcp ninf <2 x double>, %sqrt + store <2 x double> %div, ptr @v + %mul = fmul reassoc <2 x double> %div, %div + store <2 x double> %mul, ptr @v1 + %div1 = fdiv reassoc <2 x double> %a, %sqrt + store <2 x double> %div1, ptr @v2 + ret void +} + +define void @strict_fp_metadata(double %a) { +; CHECK-LABEL: define void @strict_fp_metadata( +; CHECK-SAME: double [[A:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CONV:%.*]] = call double @llvm.experimental.constrained.sitofp.f64.i32(i32 1, metadata !"round.dynamic", metadata !"fpexcept.strict") +; CHECK-NEXT: [[CALL:%.*]] = call 
double @llvm.sqrt.f64(double noundef [[A]]) +; CHECK-NEXT: [[DIV:%.*]] = call double @llvm.experimental.constrained.fdiv.f64(double [[CONV]], double [[CALL]], metadata !"round.dynamic", metadata !"fpexcept.strict") +; CHECK-NEXT: store double [[DIV]], ptr @x, align 8 +; CHECK-NEXT: [[MUL:%.*]] = call double @llvm.experimental.constrained.fmul.f64(double [[DIV]], double [[DIV]], metadata !"round.dynamic", metadata !"fpexcept.strict") +; CHECK-NEXT: store double [[MUL]], ptr @r1, align 8 +; CHECK-NEXT: [[DIV2:%.*]] = call double @llvm.experimental.constrained.fdiv.f64(double [[A]], double [[CALL]], metadata !"round.dynamic", metadata !"fpexcept.strict") +; CHECK-NEXT: store double [[DIV2]], ptr @r2, align 8 +; CHECK-NEXT: ret void +; +entry: + %conv = call double @llvm.experimental.constrained.sitofp.f64.i32(i32 1, metadata !"round.dynamic", metadata !"fpexcept.strict") + %call = call double @llvm.sqrt.f64(double noundef %a) + %div = call double @llvm.experimental.constrained.fdiv.f64(double %conv, double %call, metadata !"round.dynamic", metadata !"fpexcept.strict") + store double %div, ptr @x + %mul = call double @llvm.experimental.constrained.fmul.f64(double %div, double %div, metadata !"round.dynamic", metadata !"fpexcept.strict") + store double %mul, ptr @r1 + %div2 = call double @llvm.experimental.constrained.fdiv.f64(double %a, double %call, metadata !"round.dynamic", metadata !"fpexcept.strict") + store double %div2, ptr @r2 + ret void +} + +declare double @llvm.experimental.constrained.sitofp.f64.i32(i32, metadata, metadata) +declare double @llvm.experimental.constrained.fdiv.f64(double, double, metadata, metadata) +declare double @llvm.experimental.constrained.fmul.f64(double, double, metadata, metadata) +declare double @llvm.sqrt.f64(double) +declare <2 x double> @llvm.sqrt.v2f64(<2 x double>) + +!0 = !{float 2.5} +!1 = !{float 3.5} +!2 = !{float 4.5} +!3 = !{float 5.5} +; CHECK: [[META0]] = !{float 5.500000e+00} +; CHECK: [[META1]] = !{float 4.500000e+00} +; CHECK: [[META2]] = !{float 3.500000e+00} From 2c9dc089fd6aeb7570206b0a8b36cfb9298c2893 Mon Sep 17 00:00:00 2001 From: Benjamin Maxwell Date: Fri, 17 Jan 2025 10:09:31 +0000 Subject: [PATCH 19/45] [AArch64] Use spill size when calculating callee saves size (NFC) (#123086) This is an NFC right now, as currently, all register and spill sizes are the same, but the spill size is the correct size to use here. 
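
For reference, a minimal sketch of the query this switches to (the same calls
appear in the diff below):

  const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
  unsigned SpillSize = TRI->getSpillSize(*RC); // bytes a stack slot for this class must hold

In general the spill size of a register class need not equal the architectural
register width, which is why it is the right quantity for sizing callee-save
slots.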
--- llvm/lib/Target/AArch64/AArch64FrameLowering.cpp | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp index 206e410047db5..dd248cf39a5ce 100644 --- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp @@ -3795,14 +3795,15 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF, unsigned CSStackSize = 0; unsigned SVECSStackSize = 0; const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); - const MachineRegisterInfo &MRI = MF.getRegInfo(); for (unsigned Reg : SavedRegs.set_bits()) { - auto RegSize = TRI->getRegSizeInBits(Reg, MRI) / 8; + auto *RC = TRI->getMinimalPhysRegClass(Reg); + assert(RC && "expected register class!"); + auto SpillSize = TRI->getSpillSize(*RC); if (AArch64::PPRRegClass.contains(Reg) || AArch64::ZPRRegClass.contains(Reg)) - SVECSStackSize += RegSize; + SVECSStackSize += SpillSize; else - CSStackSize += RegSize; + CSStackSize += SpillSize; } // Increase the callee-saved stack size if the function has streaming mode From 32a4650f3c76efee3bd515e25d70ae39d980b071 Mon Sep 17 00:00:00 2001 From: Benjamin Maxwell Date: Fri, 17 Jan 2025 10:10:21 +0000 Subject: [PATCH 20/45] [AArch64] Avoid hardcoding spill size/align in FrameLowering (NFC) (#123080) This is already defined for each register class in AArch64RegisterInfo, not hardcoding it here makes these values easier to change (perhaps based on hardware mode). --- .../Target/AArch64/AArch64FrameLowering.cpp | 69 ++++++------------- 1 file changed, 20 insertions(+), 49 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp index dd248cf39a5ce..1582d1999ca1d 100644 --- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp @@ -2926,26 +2926,12 @@ struct RegPairInfo { int FrameIdx; int Offset; enum RegType { GPR, FPR64, FPR128, PPR, ZPR, VG } Type; + const TargetRegisterClass *RC; RegPairInfo() = default; bool isPaired() const { return Reg2 != AArch64::NoRegister; } - unsigned getScale() const { - switch (Type) { - case PPR: - return 2; - case GPR: - case FPR64: - case VG: - return 8; - case ZPR: - case FPR128: - return 16; - } - llvm_unreachable("Unsupported type"); - } - bool isScalable() const { return Type == PPR || Type == ZPR; } }; @@ -3023,20 +3009,27 @@ static void computeCalleeSaveRegisterPairs( RegPairInfo RPI; RPI.Reg1 = CSI[i].getReg(); - if (AArch64::GPR64RegClass.contains(RPI.Reg1)) + if (AArch64::GPR64RegClass.contains(RPI.Reg1)) { RPI.Type = RegPairInfo::GPR; - else if (AArch64::FPR64RegClass.contains(RPI.Reg1)) + RPI.RC = &AArch64::GPR64RegClass; + } else if (AArch64::FPR64RegClass.contains(RPI.Reg1)) { RPI.Type = RegPairInfo::FPR64; - else if (AArch64::FPR128RegClass.contains(RPI.Reg1)) + RPI.RC = &AArch64::FPR64RegClass; + } else if (AArch64::FPR128RegClass.contains(RPI.Reg1)) { RPI.Type = RegPairInfo::FPR128; - else if (AArch64::ZPRRegClass.contains(RPI.Reg1)) + RPI.RC = &AArch64::FPR128RegClass; + } else if (AArch64::ZPRRegClass.contains(RPI.Reg1)) { RPI.Type = RegPairInfo::ZPR; - else if (AArch64::PPRRegClass.contains(RPI.Reg1)) + RPI.RC = &AArch64::ZPRRegClass; + } else if (AArch64::PPRRegClass.contains(RPI.Reg1)) { RPI.Type = RegPairInfo::PPR; - else if (RPI.Reg1 == AArch64::VG) + RPI.RC = &AArch64::PPRRegClass; + } else if (RPI.Reg1 == AArch64::VG) { RPI.Type = RegPairInfo::VG; - 
else + RPI.RC = &AArch64::FIXED_REGSRegClass; + } else { llvm_unreachable("Unsupported register class."); + } // Add the stack hazard size as we transition from GPR->FPR CSRs. if (AFI->hasStackHazardSlotIndex() && @@ -3045,7 +3038,7 @@ static void computeCalleeSaveRegisterPairs( ByteOffset += StackFillDir * StackHazardSize; LastReg = RPI.Reg1; - int Scale = RPI.getScale(); + int Scale = TRI->getSpillSize(*RPI.RC); // Add the next reg to the pair if it is in the same register class. if (unsigned(i + RegInc) < Count && !AFI->hasStackHazardSlotIndex()) { Register NextReg = CSI[i + RegInc].getReg(); @@ -3254,38 +3247,26 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters( // Rationale: This sequence saves uop updates compared to a sequence of // pre-increment spills like stp xi,xj,[sp,#-16]! // Note: Similar rationale and sequence for restores in epilog. - unsigned Size; - Align Alignment; + unsigned Size = TRI->getSpillSize(*RPI.RC); + Align Alignment = TRI->getSpillAlign(*RPI.RC); switch (RPI.Type) { case RegPairInfo::GPR: StrOpc = RPI.isPaired() ? AArch64::STPXi : AArch64::STRXui; - Size = 8; - Alignment = Align(8); break; case RegPairInfo::FPR64: StrOpc = RPI.isPaired() ? AArch64::STPDi : AArch64::STRDui; - Size = 8; - Alignment = Align(8); break; case RegPairInfo::FPR128: StrOpc = RPI.isPaired() ? AArch64::STPQi : AArch64::STRQui; - Size = 16; - Alignment = Align(16); break; case RegPairInfo::ZPR: StrOpc = RPI.isPaired() ? AArch64::ST1B_2Z_IMM : AArch64::STR_ZXI; - Size = 16; - Alignment = Align(16); break; case RegPairInfo::PPR: StrOpc = AArch64::STR_PXI; - Size = 2; - Alignment = Align(2); break; case RegPairInfo::VG: StrOpc = AArch64::STRXui; - Size = 8; - Alignment = Align(8); break; } @@ -3495,33 +3476,23 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters( // ldp x22, x21, [sp, #0] // addImm(+0) // Note: see comment in spillCalleeSavedRegisters() unsigned LdrOpc; - unsigned Size; - Align Alignment; + unsigned Size = TRI->getSpillSize(*RPI.RC); + Align Alignment = TRI->getSpillAlign(*RPI.RC); switch (RPI.Type) { case RegPairInfo::GPR: LdrOpc = RPI.isPaired() ? AArch64::LDPXi : AArch64::LDRXui; - Size = 8; - Alignment = Align(8); break; case RegPairInfo::FPR64: LdrOpc = RPI.isPaired() ? AArch64::LDPDi : AArch64::LDRDui; - Size = 8; - Alignment = Align(8); break; case RegPairInfo::FPR128: LdrOpc = RPI.isPaired() ? AArch64::LDPQi : AArch64::LDRQui; - Size = 16; - Alignment = Align(16); break; case RegPairInfo::ZPR: LdrOpc = RPI.isPaired() ? AArch64::LD1B_2Z_IMM : AArch64::LDR_ZXI; - Size = 16; - Alignment = Align(16); break; case RegPairInfo::PPR: LdrOpc = AArch64::LDR_PXI; - Size = 2; - Alignment = Align(2); break; case RegPairInfo::VG: continue; From e79bb8731ae9089f0635e5634883267a091e318d Mon Sep 17 00:00:00 2001 From: Sushant Gokhale Date: Fri, 17 Jan 2025 02:14:04 -0800 Subject: [PATCH 21/45] [InstCombine] Fixup commit 7253c6f (#123315) This should fix the assert failure we were getting for the darwin OS. 
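
(Presumably the problem was erasing the visited fdiv inside the helper and
returning null, after which visitFDiv kept working on an erased instruction;
returning the result of replaceInstUsesWith is the usual InstCombine idiom and
leaves the erasure to the driver. See the one-line diff below.)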
--- llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp b/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp index b6acde9bdd110..df5f9833a2ff9 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp @@ -2067,8 +2067,7 @@ convertFSqrtDivIntoFMul(CallInst *CI, Instruction *X, FMul->copyMetadata(*X); FMul->copyFastMathFlags(FastMathFlags::intersectRewrite(R1FMF, R2FMF) | FastMathFlags::unionValue(R1FMF, R2FMF)); - IC->replaceInstUsesWith(*X, FMul); - return IC->eraseInstFromFunction(*X); + return IC->replaceInstUsesWith(*X, FMul); } Instruction *InstCombinerImpl::visitFDiv(BinaryOperator &I) { From 9491f75e1d912b277247450d1c7b6d56f7faf885 Mon Sep 17 00:00:00 2001 From: Hassnaa Hamdi Date: Fri, 17 Jan 2025 10:34:57 +0000 Subject: [PATCH 22/45] Reland: [LV]: Teach LV to recursively (de)interleave. (#122989) This commit relands the changes from "[LV]: Teach LV to recursively (de)interleave. #89018" Reason for revert: - The patch exposed a bug in the IA pass, the bug is now fixed and landed by commit: #122643 --- .../Transforms/Vectorize/LoopVectorize.cpp | 14 +- .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 79 +- .../AArch64/sve-interleaved-accesses.ll | 260 +++- .../sve-interleaved-masked-accesses.ll | 252 ++++ .../RISCV/interleaved-accesses.ll | 1318 +++++++++-------- .../AArch64/sve-interleave-vectorization.ll | 135 ++ 6 files changed, 1387 insertions(+), 671 deletions(-) create mode 100644 llvm/test/Transforms/PhaseOrdering/AArch64/sve-interleave-vectorization.ll diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 8024cde41b5f9..6df11abda9e98 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -3505,10 +3505,10 @@ bool LoopVectorizationCostModel::interleavedAccessCanBeWidened( if (hasIrregularType(ScalarTy, DL)) return false; - // We currently only know how to emit interleave/deinterleave with - // Factor=2 for scalable vectors. This is purely an implementation - // limit. - if (VF.isScalable() && InterleaveFactor != 2) + // For scalable vectors, the only interleave factor currently supported + // must be power of 2 since we require the (de)interleave2 intrinsics + // instead of shufflevectors. + if (VF.isScalable() && !isPowerOf2_32(InterleaveFactor)) return false; // If the group involves a non-integral pointer, we may not be able to @@ -9435,9 +9435,9 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) { CM.getWideningDecision(IG->getInsertPos(), VF) == LoopVectorizationCostModel::CM_Interleave); // For scalable vectors, the only interleave factor currently supported - // is 2 since we require the (de)interleave2 intrinsics instead of - // shufflevectors. - assert((!Result || !VF.isScalable() || IG->getFactor() == 2) && + // must be power of 2 since we require the (de)interleave2 intrinsics + // instead of shufflevectors. 
+ assert((!Result || !VF.isScalable() || isPowerOf2_32(IG->getFactor())) && "Unsupported interleave factor for scalable vectors"); return Result; }; diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 979a8e0768a99..5ae2f43e4950c 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -2863,10 +2863,21 @@ static Value *interleaveVectors(IRBuilderBase &Builder, ArrayRef Vals, // Scalable vectors cannot use arbitrary shufflevectors (only splats), so // must use intrinsics to interleave. if (VecTy->isScalableTy()) { - VectorType *WideVecTy = VectorType::getDoubleElementsVectorType(VecTy); - return Builder.CreateIntrinsic(WideVecTy, Intrinsic::vector_interleave2, - Vals, - /*FMFSource=*/nullptr, Name); + assert(isPowerOf2_32(Factor) && "Unsupported interleave factor for " + "scalable vectors, must be power of 2"); + SmallVector InterleavingValues(Vals); + // When interleaving, the number of values will be shrunk until we have the + // single final interleaved value. + auto *InterleaveTy = cast(InterleavingValues[0]->getType()); + for (unsigned Midpoint = Factor / 2; Midpoint > 0; Midpoint /= 2) { + InterleaveTy = VectorType::getDoubleElementsVectorType(InterleaveTy); + for (unsigned I = 0; I < Midpoint; ++I) + InterleavingValues[I] = Builder.CreateIntrinsic( + InterleaveTy, Intrinsic::vector_interleave2, + {InterleavingValues[I], InterleavingValues[Midpoint + I]}, + /*FMFSource=*/nullptr, Name); + } + return InterleavingValues[0]; } // Fixed length. Start by concatenating all vectors into a wide vector. @@ -2952,15 +2963,11 @@ void VPInterleaveRecipe::execute(VPTransformState &State) { &InterleaveFactor](Value *MaskForGaps) -> Value * { if (State.VF.isScalable()) { assert(!MaskForGaps && "Interleaved groups with gaps are not supported."); - assert(InterleaveFactor == 2 && + assert(isPowerOf2_32(InterleaveFactor) && "Unsupported deinterleave factor for scalable vectors"); auto *ResBlockInMask = State.get(BlockInMask); - SmallVector Ops = {ResBlockInMask, ResBlockInMask}; - auto *MaskTy = VectorType::get(State.Builder.getInt1Ty(), - State.VF.getKnownMinValue() * 2, true); - return State.Builder.CreateIntrinsic( - MaskTy, Intrinsic::vector_interleave2, Ops, - /*FMFSource=*/nullptr, "interleaved.mask"); + SmallVector Ops(InterleaveFactor, ResBlockInMask); + return interleaveVectors(State.Builder, Ops, "interleaved.mask"); } if (!BlockInMask) @@ -3000,22 +3007,48 @@ void VPInterleaveRecipe::execute(VPTransformState &State) { ArrayRef VPDefs = definedValues(); const DataLayout &DL = State.CFG.PrevBB->getDataLayout(); if (VecTy->isScalableTy()) { - assert(InterleaveFactor == 2 && + assert(isPowerOf2_32(InterleaveFactor) && "Unsupported deinterleave factor for scalable vectors"); - // Scalable vectors cannot use arbitrary shufflevectors (only splats), - // so must use intrinsics to deinterleave. - Value *DI = State.Builder.CreateIntrinsic( - Intrinsic::vector_deinterleave2, VecTy, NewLoad, - /*FMFSource=*/nullptr, "strided.vec"); - unsigned J = 0; - for (unsigned I = 0; I < InterleaveFactor; ++I) { - Instruction *Member = Group->getMember(I); + // Scalable vectors cannot use arbitrary shufflevectors (only splats), + // so must use intrinsics to deinterleave. 
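+      // Illustration (InterleaveFactor == 4, a sketch of the loop below):
+      //   wide               = [x0 y0 z0 t0 x1 y1 z1 t1 ...]
+      //   {evens, odds}      = deinterleave2(wide)   ; evens = [x z ...], odds = [y t ...]
+      //   {member0, member2} = deinterleave2(evens)
+      //   {member1, member3} = deinterleave2(odds)
+      // so after the loop DeinterleavedValues[I] holds member I of the group.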
+ SmallVector DeinterleavedValues(InterleaveFactor); + DeinterleavedValues[0] = NewLoad; + // For the case of InterleaveFactor > 2, we will have to do recursive + // deinterleaving, because the current available deinterleave intrinsic + // supports only Factor of 2, otherwise it will bailout after first + // iteration. + // When deinterleaving, the number of values will double until we + // have "InterleaveFactor". + for (unsigned NumVectors = 1; NumVectors < InterleaveFactor; + NumVectors *= 2) { + // Deinterleave the elements within the vector + SmallVector TempDeinterleavedValues(NumVectors); + for (unsigned I = 0; I < NumVectors; ++I) { + auto *DiTy = DeinterleavedValues[I]->getType(); + TempDeinterleavedValues[I] = State.Builder.CreateIntrinsic( + Intrinsic::vector_deinterleave2, DiTy, DeinterleavedValues[I], + /*FMFSource=*/nullptr, "strided.vec"); + } + // Extract the deinterleaved values: + for (unsigned I = 0; I < 2; ++I) + for (unsigned J = 0; J < NumVectors; ++J) + DeinterleavedValues[NumVectors * I + J] = + State.Builder.CreateExtractValue(TempDeinterleavedValues[J], I); + } - if (!Member) +#ifndef NDEBUG + for (Value *Val : DeinterleavedValues) + assert(Val && "NULL Deinterleaved Value"); +#endif + for (unsigned I = 0, J = 0; I < InterleaveFactor; ++I) { + Instruction *Member = Group->getMember(I); + Value *StridedVec = DeinterleavedValues[I]; + if (!Member) { + // This value is not needed as it's not used + cast(StridedVec)->eraseFromParent(); continue; - - Value *StridedVec = State.Builder.CreateExtractValue(DI, I); + } // If this member has different type, cast the result type. if (Member->getType() != ScalarTy) { VectorType *OtherVTy = VectorType::get(Member->getType(), State.VF); diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll index bf95622733461..05c0bc0761ea4 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll @@ -396,8 +396,8 @@ define void @test_reversed_load2_store2(ptr noalias nocapture readonly %A, ptr n ; CHECK-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[TMP9]], align 4 ; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.vector.deinterleave2.nxv8i32( [[WIDE_VEC]]) ; CHECK-NEXT: [[TMP10:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 -; CHECK-NEXT: [[REVERSE:%.*]] = call @llvm.vector.reverse.nxv4i32( [[TMP10]]) ; CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 +; CHECK-NEXT: [[REVERSE:%.*]] = call @llvm.vector.reverse.nxv4i32( [[TMP10]]) ; CHECK-NEXT: [[REVERSE1:%.*]] = call @llvm.vector.reverse.nxv4i32( [[TMP11]]) ; CHECK-NEXT: [[TMP12:%.*]] = add nsw [[REVERSE]], [[VEC_IND]] ; CHECK-NEXT: [[TMP13:%.*]] = sub nsw [[REVERSE1]], [[VEC_IND]] @@ -1548,5 +1548,263 @@ end: ret void } +; Check vectorization on an interleaved load/store groups of factor 4 + +; for (int i = 0; i < 1024; ++i) { +; dst[i].x = a[i].x + b[i].x; +; dst[i].y = a[i].y - b[i].y; +; dst[i].z = a[i].z << b[i].z; +; dst[i].t = a[i].t >> b[i].t; +; } +%struct.xyzt = type { i32, i32, i32, i32 } + +define void @interleave_deinterleave(ptr writeonly noalias %dst, ptr readonly %a, ptr readonly %b) { +; CHECK-LABEL: @interleave_deinterleave( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[TMP0]], 2 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ugt i64 [[TMP1]], 1024 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], 
label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP3:%.*]] = shl i64 [[TMP2]], 2 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub nuw nsw i64 1024, [[N_MOD_VF]] +; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP5:%.*]] = shl i64 [[TMP4]], 2 +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT_XYZT:%.*]], ptr [[A:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[TMP6]], align 4 +; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.vector.deinterleave2.nxv16i32( [[WIDE_VEC]]) +; CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 +; CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 +; CHECK-NEXT: [[STRIDED_VEC6:%.*]] = call { , } @llvm.vector.deinterleave2.nxv8i32( [[TMP7]]) +; CHECK-NEXT: [[STRIDED_VEC7:%.*]] = call { , } @llvm.vector.deinterleave2.nxv8i32( [[TMP8]]) +; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , } [[STRIDED_VEC6]], 0 +; CHECK-NEXT: [[TMP10:%.*]] = extractvalue { , } [[STRIDED_VEC7]], 0 +; CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , } [[STRIDED_VEC6]], 1 +; CHECK-NEXT: [[TMP12:%.*]] = extractvalue { , } [[STRIDED_VEC7]], 1 +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds [[STRUCT_XYZT]], ptr [[B:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[WIDE_VEC8:%.*]] = load , ptr [[TMP13]], align 4 +; CHECK-NEXT: [[STRIDED_VEC9:%.*]] = call { , } @llvm.vector.deinterleave2.nxv16i32( [[WIDE_VEC8]]) +; CHECK-NEXT: [[TMP14:%.*]] = extractvalue { , } [[STRIDED_VEC9]], 0 +; CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , } [[STRIDED_VEC9]], 1 +; CHECK-NEXT: [[STRIDED_VEC10:%.*]] = call { , } @llvm.vector.deinterleave2.nxv8i32( [[TMP14]]) +; CHECK-NEXT: [[STRIDED_VEC11:%.*]] = call { , } @llvm.vector.deinterleave2.nxv8i32( [[TMP15]]) +; CHECK-NEXT: [[TMP16:%.*]] = extractvalue { , } [[STRIDED_VEC10]], 0 +; CHECK-NEXT: [[TMP17:%.*]] = extractvalue { , } [[STRIDED_VEC11]], 0 +; CHECK-NEXT: [[TMP18:%.*]] = extractvalue { , } [[STRIDED_VEC10]], 1 +; CHECK-NEXT: [[TMP19:%.*]] = extractvalue { , } [[STRIDED_VEC11]], 1 +; CHECK-NEXT: [[TMP20:%.*]] = add nsw [[TMP16]], [[TMP9]] +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds [[STRUCT_XYZT]], ptr [[DST:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP22:%.*]] = sub nsw [[TMP10]], [[TMP17]] +; CHECK-NEXT: [[TMP23:%.*]] = shl [[TMP11]], [[TMP18]] +; CHECK-NEXT: [[TMP24:%.*]] = ashr [[TMP12]], [[TMP19]] +; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave2.nxv8i32( [[TMP20]], [[TMP23]]) +; CHECK-NEXT: [[INTERLEAVED_VEC12:%.*]] = call @llvm.vector.interleave2.nxv8i32( [[TMP22]], [[TMP24]]) +; CHECK-NEXT: [[INTERLEAVED_VEC13:%.*]] = call @llvm.vector.interleave2.nxv16i32( [[INTERLEAVED_VEC]], [[INTERLEAVED_VEC12]]) +; CHECK-NEXT: store [[INTERLEAVED_VEC13]], ptr [[TMP21]], align 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] +; CHECK-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP41:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 
[[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [[STRUCT_XYZT]], ptr [[A]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP26:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [[STRUCT_XYZT]], ptr [[B]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP27:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4 +; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP27]], [[TMP26]] +; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds nuw [[STRUCT_XYZT]], ptr [[DST]], i64 [[INDVARS_IV]] +; CHECK-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX5]], align 4 +; CHECK-NEXT: [[Y:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX]], i64 4 +; CHECK-NEXT: [[TMP28:%.*]] = load i32, ptr [[Y]], align 4 +; CHECK-NEXT: [[Y11:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX2]], i64 4 +; CHECK-NEXT: [[TMP29:%.*]] = load i32, ptr [[Y11]], align 4 +; CHECK-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP28]], [[TMP29]] +; CHECK-NEXT: [[Y14:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX5]], i64 4 +; CHECK-NEXT: store i32 [[SUB]], ptr [[Y14]], align 4 +; CHECK-NEXT: [[Z:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX]], i64 8 +; CHECK-NEXT: [[TMP30:%.*]] = load i32, ptr [[Z]], align 4 +; CHECK-NEXT: [[Z19:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX2]], i64 8 +; CHECK-NEXT: [[TMP31:%.*]] = load i32, ptr [[Z19]], align 4 +; CHECK-NEXT: [[SHL:%.*]] = shl i32 [[TMP30]], [[TMP31]] +; CHECK-NEXT: [[Z22:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX5]], i64 8 +; CHECK-NEXT: store i32 [[SHL]], ptr [[Z22]], align 4 +; CHECK-NEXT: [[T:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX]], i64 12 +; CHECK-NEXT: [[TMP32:%.*]] = load i32, ptr [[T]], align 4 +; CHECK-NEXT: [[T27:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX2]], i64 12 +; CHECK-NEXT: [[TMP33:%.*]] = load i32, ptr [[T27]], align 4 +; CHECK-NEXT: [[SHR:%.*]] = ashr i32 [[TMP32]], [[TMP33]] +; CHECK-NEXT: [[T30:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX5]], i64 12 +; CHECK-NEXT: store i32 [[SHR]], ptr [[T30]], align 4 +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1024 +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP42:![0-9]+]] +; CHECK: for.end: +; CHECK-NEXT: ret void +; +entry: + br label %for.body + +for.body: + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds %struct.xyzt, ptr %a, i64 %indvars.iv + %0 = load i32, ptr %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds %struct.xyzt, ptr %b, i64 %indvars.iv + %1 = load i32, ptr %arrayidx2, align 4 + %add = add nsw i32 %1, %0 + %arrayidx5 = getelementptr inbounds %struct.xyzt, ptr %dst, i64 %indvars.iv + store i32 %add, ptr %arrayidx5, align 4 + %y = getelementptr inbounds nuw i8, ptr %arrayidx, i64 4 + %2 = load i32, ptr %y, align 4 + %y11 = getelementptr inbounds nuw i8, ptr %arrayidx2, i64 4 + %3 = load i32, ptr %y11, align 4 + %sub = sub nsw i32 %2, %3 + %y14 = getelementptr inbounds nuw i8, ptr %arrayidx5, i64 4 + store i32 %sub, ptr %y14, align 4 + %z = getelementptr inbounds nuw i8, ptr %arrayidx, i64 8 + %4 = load i32, ptr %z, align 4 + %z19 = getelementptr inbounds nuw i8, ptr 
%arrayidx2, i64 8 + %5 = load i32, ptr %z19, align 4 + %shl = shl i32 %4, %5 + %z22 = getelementptr inbounds nuw i8, ptr %arrayidx5, i64 8 + store i32 %shl, ptr %z22, align 4 + %t = getelementptr inbounds nuw i8, ptr %arrayidx, i64 12 + %6 = load i32, ptr %t, align 4 + %t27 = getelementptr inbounds nuw i8, ptr %arrayidx2, i64 12 + %7 = load i32, ptr %t27, align 4 + %shr = ashr i32 %6, %7 + %t30 = getelementptr inbounds nuw i8, ptr %arrayidx5, i64 12 + store i32 %shr, ptr %t30, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, 1024 + br i1 %exitcond.not, label %for.end, label %for.body + +for.end: + ret void +} + +; Check vectorization on a reverse interleaved load/store groups of factor 4 + +; for (int i = 1023; i >= 0; i--) { +; int a = A[i].x + i; +; int b = A[i].y - i; +; int c = A[i].z * i; +; int d = A[i].t << i; +; B[i].x = a; +; B[i].y = b; +; B[i].z = c; +; B[i].t = d; +; } + +define void @interleave_deinterleave_reverse(ptr noalias nocapture readonly %A, ptr noalias nocapture %B) #1{ +; CHECK-LABEL: @interleave_deinterleave_reverse( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2 +; CHECK-NEXT: [[TMP2:%.*]] = call @llvm.stepvector.nxv4i32() +; CHECK-NEXT: [[INDUCTION:%.*]] = sub splat (i32 1023), [[TMP2]] +; CHECK-NEXT: [[TMP3:%.*]] = trunc nuw nsw i64 [[TMP1]] to i32 +; CHECK-NEXT: [[TMP4:%.*]] = sub nsw i32 0, [[TMP3]] +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i32 [[TMP4]], i64 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]] +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[STRUCT_XYZT:%.*]], ptr [[A:%.*]], i64 [[OFFSET_IDX]], i32 0 +; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: [[TMP7:%.*]] = shl nuw nsw i32 [[TMP6]], 4 +; CHECK-NEXT: [[TMP8:%.*]] = sub nsw i32 4, [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = sext i32 [[TMP8]] to i64 +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i64 [[TMP9]] +; CHECK-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[TMP10]], align 4 +; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.vector.deinterleave2.nxv16i32( [[WIDE_VEC]]) +; CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 +; CHECK-NEXT: [[TMP12:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 +; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = call { , } @llvm.vector.deinterleave2.nxv8i32( [[TMP11]]) +; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = call { , } @llvm.vector.deinterleave2.nxv8i32( [[TMP12]]) +; CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , } [[STRIDED_VEC1]], 0 +; CHECK-NEXT: [[TMP14:%.*]] = extractvalue { , } [[STRIDED_VEC2]], 0 +; CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , } [[STRIDED_VEC1]], 1 +; CHECK-NEXT: [[TMP16:%.*]] = extractvalue { , } [[STRIDED_VEC2]], 1 +; CHECK-NEXT: [[REVERSE:%.*]] = call @llvm.vector.reverse.nxv4i32( [[TMP13]]) +; CHECK-NEXT: [[REVERSE3:%.*]] = call @llvm.vector.reverse.nxv4i32( [[TMP14]]) +; CHECK-NEXT: [[REVERSE4:%.*]] = call @llvm.vector.reverse.nxv4i32( [[TMP15]]) +; 
CHECK-NEXT: [[REVERSE5:%.*]] = call @llvm.vector.reverse.nxv4i32( [[TMP16]]) +; CHECK-NEXT: [[TMP17:%.*]] = add nsw [[REVERSE]], [[VEC_IND]] +; CHECK-NEXT: [[TMP18:%.*]] = sub nsw [[REVERSE3]], [[VEC_IND]] +; CHECK-NEXT: [[TMP19:%.*]] = mul nsw [[REVERSE4]], [[VEC_IND]] +; CHECK-NEXT: [[TMP20:%.*]] = shl nuw nsw [[REVERSE5]], [[VEC_IND]] +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds [[STRUCT_XYZT]], ptr [[B:%.*]], i64 [[OFFSET_IDX]], i32 0 +; CHECK-NEXT: [[TMP22:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: [[TMP23:%.*]] = shl nuw nsw i32 [[TMP22]], 4 +; CHECK-NEXT: [[TMP24:%.*]] = sub nsw i32 4, [[TMP23]] +; CHECK-NEXT: [[TMP25:%.*]] = sext i32 [[TMP24]] to i64 +; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, ptr [[TMP21]], i64 [[TMP25]] +; CHECK-NEXT: [[REVERSE6:%.*]] = call @llvm.vector.reverse.nxv4i32( [[TMP17]]) +; CHECK-NEXT: [[REVERSE7:%.*]] = call @llvm.vector.reverse.nxv4i32( [[TMP18]]) +; CHECK-NEXT: [[REVERSE8:%.*]] = call @llvm.vector.reverse.nxv4i32( [[TMP19]]) +; CHECK-NEXT: [[REVERSE9:%.*]] = call @llvm.vector.reverse.nxv4i32( [[TMP20]]) +; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave2.nxv8i32( [[REVERSE6]], [[REVERSE8]]) +; CHECK-NEXT: [[INTERLEAVED_VEC10:%.*]] = call @llvm.vector.interleave2.nxv8i32( [[REVERSE7]], [[REVERSE9]]) +; CHECK-NEXT: [[INTERLEAVED_VEC11:%.*]] = call @llvm.vector.interleave2.nxv16i32( [[INTERLEAVED_VEC]], [[INTERLEAVED_VEC10]]) +; CHECK-NEXT: store [[INTERLEAVED_VEC11]], ptr [[TMP26]], align 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP1]] +; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] +; CHECK-NEXT: [[TMP27:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; CHECK-NEXT: br i1 [[TMP27]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP43:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: ret void +; CHECK: for.body: +; CHECK-NEXT: br i1 poison, label [[FOR_BODY]], label [[FOR_COND_CLEANUP]], !llvm.loop [[LOOP44:![0-9]+]] +; +entry: + br label %for.body +for.cond.cleanup: ; preds = %for.body + ret void +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 1023, %entry ], [ %indvars.iv.next, %for.body ] + %x = getelementptr inbounds %struct.xyzt, ptr %A, i64 %indvars.iv, i32 0 + %load1 = load i32, ptr %x, align 4 + %trunc = trunc i64 %indvars.iv to i32 + %add = add nsw i32 %load1, %trunc + %y = getelementptr inbounds %struct.xyzt, ptr %A, i64 %indvars.iv, i32 1 + %load2 = load i32, ptr %y, align 4 + %sub = sub nsw i32 %load2, %trunc + %z = getelementptr inbounds %struct.xyzt, ptr %A, i64 %indvars.iv, i32 2 + %load3 = load i32, ptr %z, align 4 + %mul = mul nsw i32 %load3, %trunc + %t = getelementptr inbounds %struct.xyzt, ptr %A, i64 %indvars.iv, i32 3 + %load4 = load i32, ptr %t, align 4 + %shl = shl nuw nsw i32 %load4, %trunc + %x5 = getelementptr inbounds %struct.xyzt, ptr %B, i64 %indvars.iv, i32 0 + store i32 %add, ptr %x5, align 4 + %y8 = getelementptr inbounds %struct.xyzt, ptr %B, i64 %indvars.iv, i32 1 + store i32 %sub, ptr %y8, align 4 + %z5 = getelementptr inbounds %struct.xyzt, ptr %B, i64 %indvars.iv, i32 2 + store i32 %mul, ptr %z5, align 4 + %t8 = getelementptr inbounds %struct.xyzt, ptr %B, i64 %indvars.iv, i32 3 + store i32 %shl, ptr %t8, align 4 + %indvars.iv.next = add nsw i64 %indvars.iv, -1 + %cmp = icmp sgt i64 %indvars.iv, 0 + br i1 %cmp, label %for.body, label 
%for.cond.cleanup + +} attributes #1 = { "target-features"="+sve" vscale_range(1, 16) } attributes #0 = { "unsafe-fp-math"="true" "target-features"="+sve" vscale_range(1, 16) } diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll index 1a281fe7c6f7f..d4392bebdf37b 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll @@ -529,3 +529,255 @@ for.inc: for.end: ret void } + +; Expected to contain interleave2/deinterleave2 instructions +; +; void masked_strided_factor4(const unsigned char* restrict p, +; unsigned char* restrict q, +; unsigned char guard) { +; for(ix=0; ix < 1024; ++ix) { +; if (ix > guard) { +; char left1 = p[4*ix]; +; char right1 = p[4*ix + 1]; +; char left2 = p[4*ix + 2]; +; char right2 = p[4*ix + 3]; +; char max1 = max(left1, right1); +; char max2 = max(left2, right2); +; q[4*ix] = max1; +; q[4*ix + 1] = 0 - max1; +; q[4*ix + 2] = max2; +; q[4*ix + 3] = 0 - max2; +; } +; } +;} +define dso_local void @masked_strided_factor4(ptr noalias nocapture readonly %p, ptr noalias nocapture %q, i8 zeroext %guard) local_unnamed_addr #0 { +; SCALAR_TAIL_FOLDING-LABEL: define dso_local void @masked_strided_factor4 +; SCALAR_TAIL_FOLDING-SAME: (ptr noalias nocapture readonly [[P:%.*]], ptr noalias nocapture [[Q:%.*]], i8 zeroext [[GUARD:%.*]]) local_unnamed_addr #[[ATTR0]] { +; SCALAR_TAIL_FOLDING-NEXT: entry: +; SCALAR_TAIL_FOLDING-NEXT: [[CONV:%.*]] = zext i8 [[GUARD]] to i32 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP0:%.*]] = call i32 @llvm.vscale.i32() +; SCALAR_TAIL_FOLDING-NEXT: [[TMP1:%.*]] = shl i32 [[TMP0]], 4 +; SCALAR_TAIL_FOLDING-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ugt i32 [[TMP1]], 1024 +; SCALAR_TAIL_FOLDING-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; SCALAR_TAIL_FOLDING: vector.ph: +; SCALAR_TAIL_FOLDING-NEXT: [[TMP2:%.*]] = call i32 @llvm.vscale.i32() +; SCALAR_TAIL_FOLDING-NEXT: [[TMP3:%.*]] = shl i32 [[TMP2]], 4 +; SCALAR_TAIL_FOLDING-NEXT: [[N_MOD_VF:%.*]] = urem i32 1024, [[TMP3]] +; SCALAR_TAIL_FOLDING-NEXT: [[N_VEC:%.*]] = sub nuw nsw i32 1024, [[N_MOD_VF]] +; SCALAR_TAIL_FOLDING-NEXT: [[TMP4:%.*]] = call i32 @llvm.vscale.i32() +; SCALAR_TAIL_FOLDING-NEXT: [[TMP5:%.*]] = shl i32 [[TMP4]], 4 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP6:%.*]] = call @llvm.stepvector.nxv16i32() +; SCALAR_TAIL_FOLDING-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i32 [[TMP5]], i64 0 +; SCALAR_TAIL_FOLDING-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +; SCALAR_TAIL_FOLDING-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[CONV]], i64 0 +; SCALAR_TAIL_FOLDING-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; SCALAR_TAIL_FOLDING-NEXT: br label [[VECTOR_BODY:%.*]] +; SCALAR_TAIL_FOLDING: vector.body: +; SCALAR_TAIL_FOLDING-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; SCALAR_TAIL_FOLDING-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP6]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; SCALAR_TAIL_FOLDING-NEXT: [[TMP7:%.*]] = icmp ugt [[VEC_IND]], [[BROADCAST_SPLAT]] +; SCALAR_TAIL_FOLDING-NEXT: [[TMP8:%.*]] = shl i32 [[INDEX]], 2 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP9:%.*]] = sext i32 [[TMP8]] to i64 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[P]], i64 
[[TMP9]] +; SCALAR_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK:%.*]] = call @llvm.vector.interleave2.nxv32i1( [[TMP7]], [[TMP7]]) +; SCALAR_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK1:%.*]] = call @llvm.vector.interleave2.nxv32i1( [[TMP7]], [[TMP7]]) +; SCALAR_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK2:%.*]] = call @llvm.vector.interleave2.nxv64i1( [[INTERLEAVED_MASK]], [[INTERLEAVED_MASK1]]) +; SCALAR_TAIL_FOLDING-NEXT: [[WIDE_MASKED_VEC:%.*]] = call @llvm.masked.load.nxv64i8.p0(ptr [[TMP10]], i32 1, [[INTERLEAVED_MASK2]], poison) +; SCALAR_TAIL_FOLDING-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.vector.deinterleave2.nxv64i8( [[WIDE_MASKED_VEC]]) +; SCALAR_TAIL_FOLDING-NEXT: [[TMP11:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP12:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 +; SCALAR_TAIL_FOLDING-NEXT: [[STRIDED_VEC3:%.*]] = call { , } @llvm.vector.deinterleave2.nxv32i8( [[TMP11]]) +; SCALAR_TAIL_FOLDING-NEXT: [[STRIDED_VEC4:%.*]] = call { , } @llvm.vector.deinterleave2.nxv32i8( [[TMP12]]) +; SCALAR_TAIL_FOLDING-NEXT: [[TMP13:%.*]] = extractvalue { , } [[STRIDED_VEC3]], 0 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP14:%.*]] = extractvalue { , } [[STRIDED_VEC4]], 0 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP15:%.*]] = extractvalue { , } [[STRIDED_VEC3]], 1 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP16:%.*]] = extractvalue { , } [[STRIDED_VEC4]], 1 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP17:%.*]] = call @llvm.smax.nxv16i8( [[TMP13]], [[TMP14]]) +; SCALAR_TAIL_FOLDING-NEXT: [[TMP18:%.*]] = sub zeroinitializer, [[TMP17]] +; SCALAR_TAIL_FOLDING-NEXT: [[TMP19:%.*]] = call @llvm.smax.nxv16i8( [[TMP15]], [[TMP16]]) +; SCALAR_TAIL_FOLDING-NEXT: [[TMP20:%.*]] = sub zeroinitializer, [[TMP19]] +; SCALAR_TAIL_FOLDING-NEXT: [[TMP21:%.*]] = sext i32 [[TMP8]] to i64 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP22:%.*]] = getelementptr i8, ptr [[Q]], i64 [[TMP21]] +; SCALAR_TAIL_FOLDING-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave2.nxv32i8( [[TMP17]], [[TMP19]]) +; SCALAR_TAIL_FOLDING-NEXT: [[INTERLEAVED_VEC5:%.*]] = call @llvm.vector.interleave2.nxv32i8( [[TMP18]], [[TMP20]]) +; SCALAR_TAIL_FOLDING-NEXT: [[INTERLEAVED_VEC6:%.*]] = call @llvm.vector.interleave2.nxv64i8( [[INTERLEAVED_VEC]], [[INTERLEAVED_VEC5]]) +; SCALAR_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK7:%.*]] = call @llvm.vector.interleave2.nxv32i1( [[TMP7]], [[TMP7]]) +; SCALAR_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK8:%.*]] = call @llvm.vector.interleave2.nxv32i1( [[TMP7]], [[TMP7]]) +; SCALAR_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK9:%.*]] = call @llvm.vector.interleave2.nxv64i1( [[INTERLEAVED_MASK7]], [[INTERLEAVED_MASK8]]) +; SCALAR_TAIL_FOLDING-NEXT: call void @llvm.masked.store.nxv64i8.p0( [[INTERLEAVED_VEC6]], ptr [[TMP22]], i32 1, [[INTERLEAVED_MASK9]]) +; SCALAR_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP5]] +; SCALAR_TAIL_FOLDING-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] +; SCALAR_TAIL_FOLDING-NEXT: [[TMP23:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; SCALAR_TAIL_FOLDING-NEXT: br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; SCALAR_TAIL_FOLDING: middle.block: +; SCALAR_TAIL_FOLDING-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N_MOD_VF]], 0 +; SCALAR_TAIL_FOLDING-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; SCALAR_TAIL_FOLDING: scalar.ph: +; SCALAR_TAIL_FOLDING-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; SCALAR_TAIL_FOLDING-NEXT: br label [[FOR_BODY:%.*]] +; SCALAR_TAIL_FOLDING: for.body: +; 
SCALAR_TAIL_FOLDING-NEXT: [[IX_024:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_INC:%.*]] ] +; SCALAR_TAIL_FOLDING-NEXT: [[CMP1:%.*]] = icmp samesign ugt i32 [[IX_024]], [[CONV]] +; SCALAR_TAIL_FOLDING-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]] +; SCALAR_TAIL_FOLDING: if.then: +; SCALAR_TAIL_FOLDING-NEXT: [[IDX0:%.*]] = shl nuw nsw i32 [[IX_024]], 2 +; SCALAR_TAIL_FOLDING-NEXT: [[IDX1:%.*]] = or disjoint i32 [[IDX0]], 1 +; SCALAR_TAIL_FOLDING-NEXT: [[IDX2:%.*]] = or disjoint i32 [[IDX0]], 2 +; SCALAR_TAIL_FOLDING-NEXT: [[IDX3:%.*]] = or disjoint i32 [[IDX0]], 3 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP24:%.*]] = zext nneg i32 [[IDX0]] to i64 +; SCALAR_TAIL_FOLDING-NEXT: [[ARRAY1IDX0:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 [[TMP24]] +; SCALAR_TAIL_FOLDING-NEXT: [[TMP25:%.*]] = load i8, ptr [[ARRAY1IDX0]], align 1 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP26:%.*]] = zext nneg i32 [[IDX1]] to i64 +; SCALAR_TAIL_FOLDING-NEXT: [[ARRAY1IDX1:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 [[TMP26]] +; SCALAR_TAIL_FOLDING-NEXT: [[TMP27:%.*]] = load i8, ptr [[ARRAY1IDX1]], align 1 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP28:%.*]] = zext nneg i32 [[IDX2]] to i64 +; SCALAR_TAIL_FOLDING-NEXT: [[ARRAY1IDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 [[TMP28]] +; SCALAR_TAIL_FOLDING-NEXT: [[TMP29:%.*]] = load i8, ptr [[ARRAY1IDX2]], align 1 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP30:%.*]] = zext nneg i32 [[IDX3]] to i64 +; SCALAR_TAIL_FOLDING-NEXT: [[ARRAY1IDX3:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 [[TMP30]] +; SCALAR_TAIL_FOLDING-NEXT: [[TMP31:%.*]] = load i8, ptr [[ARRAY1IDX3]], align 1 +; SCALAR_TAIL_FOLDING-NEXT: [[SPEC_SELECT_I1:%.*]] = call i8 @llvm.smax.i8(i8 [[TMP25]], i8 [[TMP27]]) +; SCALAR_TAIL_FOLDING-NEXT: [[SUB1:%.*]] = sub i8 0, [[SPEC_SELECT_I1]] +; SCALAR_TAIL_FOLDING-NEXT: [[SPEC_SELECT_I2:%.*]] = call i8 @llvm.smax.i8(i8 [[TMP29]], i8 [[TMP31]]) +; SCALAR_TAIL_FOLDING-NEXT: [[SUB2:%.*]] = sub i8 0, [[SPEC_SELECT_I2]] +; SCALAR_TAIL_FOLDING-NEXT: [[TMP32:%.*]] = zext nneg i32 [[IDX0]] to i64 +; SCALAR_TAIL_FOLDING-NEXT: [[ARRAY3IDX0:%.*]] = getelementptr inbounds nuw i8, ptr [[Q]], i64 [[TMP32]] +; SCALAR_TAIL_FOLDING-NEXT: store i8 [[SPEC_SELECT_I1]], ptr [[ARRAY3IDX0]], align 1 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP33:%.*]] = zext nneg i32 [[IDX1]] to i64 +; SCALAR_TAIL_FOLDING-NEXT: [[ARRAY3IDX1:%.*]] = getelementptr inbounds nuw i8, ptr [[Q]], i64 [[TMP33]] +; SCALAR_TAIL_FOLDING-NEXT: store i8 [[SUB1]], ptr [[ARRAY3IDX1]], align 1 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP34:%.*]] = zext nneg i32 [[IDX2]] to i64 +; SCALAR_TAIL_FOLDING-NEXT: [[ARRAY3IDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[Q]], i64 [[TMP34]] +; SCALAR_TAIL_FOLDING-NEXT: store i8 [[SPEC_SELECT_I2]], ptr [[ARRAY3IDX2]], align 1 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP35:%.*]] = zext nneg i32 [[IDX3]] to i64 +; SCALAR_TAIL_FOLDING-NEXT: [[ARRAY3IDX3:%.*]] = getelementptr inbounds nuw i8, ptr [[Q]], i64 [[TMP35]] +; SCALAR_TAIL_FOLDING-NEXT: store i8 [[SUB2]], ptr [[ARRAY3IDX3]], align 1 +; SCALAR_TAIL_FOLDING-NEXT: br label [[FOR_INC]] +; SCALAR_TAIL_FOLDING: for.inc: +; SCALAR_TAIL_FOLDING-NEXT: [[INC]] = add nuw nsw i32 [[IX_024]], 1 +; SCALAR_TAIL_FOLDING-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], 1024 +; SCALAR_TAIL_FOLDING-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; SCALAR_TAIL_FOLDING: for.end: +; SCALAR_TAIL_FOLDING-NEXT: ret void +; +; PREDICATED_TAIL_FOLDING-LABEL: define dso_local void 
@masked_strided_factor4 +; PREDICATED_TAIL_FOLDING-SAME: (ptr noalias nocapture readonly [[P:%.*]], ptr noalias nocapture [[Q:%.*]], i8 zeroext [[GUARD:%.*]]) local_unnamed_addr #[[ATTR0]] { +; PREDICATED_TAIL_FOLDING-NEXT: entry: +; PREDICATED_TAIL_FOLDING-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; PREDICATED_TAIL_FOLDING: vector.ph: +; PREDICATED_TAIL_FOLDING-NEXT: [[CONV:%.*]] = zext i8 [[GUARD]] to i32 +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP0:%.*]] = call i32 @llvm.vscale.i32() +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP1:%.*]] = shl i32 [[TMP0]], 4 +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP2:%.*]] = call i32 @llvm.vscale.i32() +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP3:%.*]] = shl i32 [[TMP2]], 4 +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP4:%.*]] = call i32 @llvm.usub.sat.i32(i32 1024, i32 [[TMP3]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv16i1.i32(i32 0, i32 1024) +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP5:%.*]] = call @llvm.stepvector.nxv16i32() +; PREDICATED_TAIL_FOLDING-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i32 [[TMP1]], i64 0 +; PREDICATED_TAIL_FOLDING-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +; PREDICATED_TAIL_FOLDING-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[CONV]], i64 0 +; PREDICATED_TAIL_FOLDING-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; PREDICATED_TAIL_FOLDING-NEXT: br label [[VECTOR_BODY:%.*]] +; PREDICATED_TAIL_FOLDING: vector.body: +; PREDICATED_TAIL_FOLDING-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] +; PREDICATED_TAIL_FOLDING-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP5]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP6:%.*]] = icmp ugt [[VEC_IND]], [[BROADCAST_SPLAT]] +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP7:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP6]], zeroinitializer +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP8:%.*]] = shl i32 [[INDEX]], 2 +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP9:%.*]] = sext i32 [[TMP8]] to i64 +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP9]] +; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK:%.*]] = call @llvm.vector.interleave2.nxv32i1( [[TMP7]], [[TMP7]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK1:%.*]] = call @llvm.vector.interleave2.nxv32i1( [[TMP7]], [[TMP7]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK2:%.*]] = call @llvm.vector.interleave2.nxv64i1( [[INTERLEAVED_MASK]], [[INTERLEAVED_MASK1]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[WIDE_MASKED_VEC:%.*]] = call @llvm.masked.load.nxv64i8.p0(ptr [[TMP10]], i32 1, [[INTERLEAVED_MASK2]], poison) +; PREDICATED_TAIL_FOLDING-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.vector.deinterleave2.nxv64i8( [[WIDE_MASKED_VEC]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP11:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP12:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 +; PREDICATED_TAIL_FOLDING-NEXT: [[STRIDED_VEC3:%.*]] = call { , } @llvm.vector.deinterleave2.nxv32i8( [[TMP11]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[STRIDED_VEC4:%.*]] = call { , } @llvm.vector.deinterleave2.nxv32i8( [[TMP12]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP13:%.*]] = extractvalue { , } [[STRIDED_VEC3]], 0 
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP14:%.*]] = extractvalue { , } [[STRIDED_VEC4]], 0 +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP15:%.*]] = extractvalue { , } [[STRIDED_VEC3]], 1 +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP16:%.*]] = extractvalue { , } [[STRIDED_VEC4]], 1 +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP17:%.*]] = call @llvm.smax.nxv16i8( [[TMP13]], [[TMP14]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP18:%.*]] = sub zeroinitializer, [[TMP17]] +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP19:%.*]] = call @llvm.smax.nxv16i8( [[TMP15]], [[TMP16]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP20:%.*]] = sub zeroinitializer, [[TMP19]] +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP21:%.*]] = sext i32 [[TMP8]] to i64 +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP22:%.*]] = getelementptr i8, ptr [[Q]], i64 [[TMP21]] +; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave2.nxv32i8( [[TMP17]], [[TMP19]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_VEC5:%.*]] = call @llvm.vector.interleave2.nxv32i8( [[TMP18]], [[TMP20]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_VEC6:%.*]] = call @llvm.vector.interleave2.nxv64i8( [[INTERLEAVED_VEC]], [[INTERLEAVED_VEC5]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK7:%.*]] = call @llvm.vector.interleave2.nxv32i1( [[TMP7]], [[TMP7]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK8:%.*]] = call @llvm.vector.interleave2.nxv32i1( [[TMP7]], [[TMP7]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK9:%.*]] = call @llvm.vector.interleave2.nxv64i1( [[INTERLEAVED_MASK7]], [[INTERLEAVED_MASK8]]) +; PREDICATED_TAIL_FOLDING-NEXT: call void @llvm.masked.store.nxv64i8.p0( [[INTERLEAVED_VEC6]], ptr [[TMP22]], i32 1, [[INTERLEAVED_MASK9]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP1]] +; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv16i1.i32(i32 [[INDEX]], i32 [[TMP4]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP23:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i64 0 +; PREDICATED_TAIL_FOLDING-NEXT: br i1 [[TMP23]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]], !llvm.loop [[LOOP8:![0-9]+]] +; PREDICATED_TAIL_FOLDING: middle.block: +; PREDICATED_TAIL_FOLDING-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] +; PREDICATED_TAIL_FOLDING: scalar.ph: +; PREDICATED_TAIL_FOLDING-NEXT: br label [[FOR_BODY:%.*]] +; PREDICATED_TAIL_FOLDING: for.body: +; PREDICATED_TAIL_FOLDING-NEXT: br i1 poison, label [[IF_THEN:%.*]], label [[FOR_INC:%.*]] +; PREDICATED_TAIL_FOLDING: if.then: +; PREDICATED_TAIL_FOLDING-NEXT: br label [[FOR_INC]] +; PREDICATED_TAIL_FOLDING: for.inc: +; PREDICATED_TAIL_FOLDING-NEXT: br i1 poison, label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; PREDICATED_TAIL_FOLDING: for.end: +; PREDICATED_TAIL_FOLDING-NEXT: ret void +; +entry: + %conv = zext i8 %guard to i32 + br label %for.body + +for.body: + %ix.024 = phi i32 [ 0, %entry ], [ %inc, %for.inc ] + %cmp1 = icmp ugt i32 %ix.024, %conv + br i1 %cmp1, label %if.then, label %for.inc + +if.then: + %idx0 = shl nuw nsw i32 %ix.024, 2 + %idx1 = add i32 %idx0, 1 + %idx2 = add i32 %idx0, 2 + %idx3 = add i32 %idx0, 3 + + %array1idx0 = getelementptr inbounds i8, ptr %p, i32 %idx0 + %0 = load i8, ptr %array1idx0, align 1 + %array1idx1 = getelementptr inbounds i8, ptr %p, i32 %idx1 + %1 = load i8, ptr %array1idx1, align 1 + %array1idx2 = getelementptr inbounds i8, ptr %p, i32 %idx2 + %2 = load i8, ptr %array1idx2, align 1 + 
%array1idx3 = getelementptr inbounds i8, ptr %p, i32 %idx3 + %3 = load i8, ptr %array1idx3, align 1 + + %cmp.i1 = icmp slt i8 %0, %1 + %spec.select.i1 = select i1 %cmp.i1, i8 %1, i8 %0 + %sub1 = sub i8 0, %spec.select.i1 + %cmp.i2 = icmp slt i8 %2, %3 + %spec.select.i2 = select i1 %cmp.i2, i8 %3, i8 %2 + %sub2 = sub i8 0, %spec.select.i2 + + %array3idx0 = getelementptr inbounds i8, ptr %q, i32 %idx0 + store i8 %spec.select.i1, ptr %array3idx0, align 1 + %array3idx1 = getelementptr inbounds i8, ptr %q, i32 %idx1 + store i8 %sub1, ptr %array3idx1, align 1 + %array3idx2 = getelementptr inbounds i8, ptr %q, i32 %idx2 + store i8 %spec.select.i2, ptr %array3idx2, align 1 + %array3idx3 = getelementptr inbounds i8, ptr %q, i32 %idx3 + store i8 %sub2, ptr %array3idx3, align 1 + + br label %for.inc + +for.inc: + %inc = add nuw nsw i32 %ix.024, 1 + %exitcond = icmp eq i32 %inc, 1024 + br i1 %exitcond, label %for.end, label %for.body + +for.end: + ret void +} diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses.ll b/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses.ll index bda4839dead51..b1ff589fe51bf 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses.ll @@ -9,7 +9,7 @@ define void @load_store_factor2_i32(ptr %p) { ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4 @@ -17,88 +17,88 @@ define void @load_store_factor2_i32(ptr %p) { ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] ; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 -; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP7:%.*]] = shl i64 [[TMP6]], 1 -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[TMP7]] -; CHECK-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[TMP8]], align 4 +; CHECK-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[I]], 0 +; CHECK-NEXT: [[OFFSET0:%.*]] = shl i64 [[TMP6]], 1 +; CHECK-NEXT: [[Q0:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[OFFSET0]] +; CHECK-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[Q0]], align 4 ; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.vector.deinterleave2.nxv8i32( [[WIDE_VEC]]) -; CHECK-NEXT: [[TMP10:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 -; CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 -; CHECK-NEXT: [[TMP12:%.*]] = add [[TMP10]], splat (i32 1) -; CHECK-NEXT: [[TMP15:%.*]] = add [[TMP11]], splat (i32 2) -; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave2.nxv8i32( [[TMP12]], [[TMP15]]) -; CHECK-NEXT: store [[INTERLEAVED_VEC]], ptr [[TMP8]], align 4 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] -; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label 
[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 +; CHECK-NEXT: [[TMP10:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 +; CHECK-NEXT: [[TMP11:%.*]] = add [[TMP9]], splat (i32 1) +; CHECK-NEXT: [[TMP12:%.*]] = add [[TMP10]], splat (i32 2) +; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave2.nxv8i32( [[TMP11]], [[TMP12]]) +; CHECK-NEXT: store [[INTERLEAVED_VEC]], ptr [[Q0]], align 4 +; CHECK-NEXT: [[NEXTI]] = add nuw i64 [[I]], [[TMP5]] +; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[NEXTI]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] +; CHECK-NEXT: br label [[LOOP1:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[OFFSET0:%.*]] = shl i64 [[I]], 1 -; CHECK-NEXT: [[Q0:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET0]] -; CHECK-NEXT: [[X0:%.*]] = load i32, ptr [[Q0]], align 4 +; CHECK-NEXT: [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ] +; CHECK-NEXT: [[OFFSET2:%.*]] = shl i64 [[I1]], 1 +; CHECK-NEXT: [[Q2:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET2]] +; CHECK-NEXT: [[X0:%.*]] = load i32, ptr [[Q2]], align 4 ; CHECK-NEXT: [[Y0:%.*]] = add i32 [[X0]], 1 -; CHECK-NEXT: store i32 [[Y0]], ptr [[Q0]], align 4 -; CHECK-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1 +; CHECK-NEXT: store i32 [[Y0]], ptr [[Q2]], align 4 +; CHECK-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET2]], 1 ; CHECK-NEXT: [[Q1:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET1]] ; CHECK-NEXT: [[X1:%.*]] = load i32, ptr [[Q1]], align 4 ; CHECK-NEXT: [[Y1:%.*]] = add i32 [[X1]], 2 ; CHECK-NEXT: store i32 [[Y1]], ptr [[Q1]], align 4 -; CHECK-NEXT: [[NEXTI]] = add i64 [[I]], 1 -; CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; CHECK-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-NEXT: [[NEXTI1]] = add i64 [[I1]], 1 +; CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024 +; CHECK-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void ; ; FIXED-LABEL: @load_store_factor2_i32( ; FIXED-NEXT: entry: -; FIXED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; FIXED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] ; FIXED: vector.ph: -; FIXED-NEXT: br label [[VECTOR_BODY:%.*]] +; FIXED-NEXT: br label [[LOOP:%.*]] ; FIXED: vector.body: -; FIXED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; FIXED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; FIXED-NEXT: [[TMP1:%.*]] = shl i64 [[TMP0]], 1 -; FIXED-NEXT: [[TMP2:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[TMP1]] -; FIXED-NEXT: [[WIDE_VEC:%.*]] = load <16 x i32>, ptr [[TMP2]], align 4 +; FIXED-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; FIXED-NEXT: [[TMP0:%.*]] = add i64 [[I]], 0 +; FIXED-NEXT: [[OFFSET0:%.*]] = shl i64 [[TMP0]], 1 +; FIXED-NEXT: [[Q0:%.*]] = 
getelementptr i32, ptr [[P:%.*]], i64 [[OFFSET0]] +; FIXED-NEXT: [[WIDE_VEC:%.*]] = load <16 x i32>, ptr [[Q0]], align 4 ; FIXED-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <8 x i32> ; FIXED-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <8 x i32> -; FIXED-NEXT: [[TMP4:%.*]] = add <8 x i32> [[STRIDED_VEC]], splat (i32 1) -; FIXED-NEXT: [[TMP7:%.*]] = add <8 x i32> [[STRIDED_VEC1]], splat (i32 2) -; FIXED-NEXT: [[TMP9:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> [[TMP7]], <16 x i32> -; FIXED-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <16 x i32> [[TMP9]], <16 x i32> poison, <16 x i32> -; FIXED-NEXT: store <16 x i32> [[INTERLEAVED_VEC]], ptr [[TMP2]], align 4 -; FIXED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 -; FIXED-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; FIXED-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; FIXED-NEXT: [[TMP3:%.*]] = add <8 x i32> [[STRIDED_VEC]], splat (i32 1) +; FIXED-NEXT: [[TMP4:%.*]] = add <8 x i32> [[STRIDED_VEC1]], splat (i32 2) +; FIXED-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> [[TMP4]], <16 x i32> +; FIXED-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <16 x i32> [[TMP5]], <16 x i32> poison, <16 x i32> +; FIXED-NEXT: store <16 x i32> [[INTERLEAVED_VEC]], ptr [[Q0]], align 4 +; FIXED-NEXT: [[NEXTI]] = add nuw i64 [[I]], 8 +; FIXED-NEXT: [[TMP6:%.*]] = icmp eq i64 [[NEXTI]], 1024 +; FIXED-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP0:![0-9]+]] ; FIXED: middle.block: ; FIXED-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] ; FIXED: scalar.ph: -; FIXED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; FIXED-NEXT: br label [[LOOP:%.*]] +; FIXED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] +; FIXED-NEXT: br label [[LOOP1:%.*]] ; FIXED: loop: -; FIXED-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] -; FIXED-NEXT: [[OFFSET0:%.*]] = shl i64 [[I]], 1 -; FIXED-NEXT: [[Q0:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET0]] -; FIXED-NEXT: [[X0:%.*]] = load i32, ptr [[Q0]], align 4 +; FIXED-NEXT: [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ] +; FIXED-NEXT: [[OFFSET2:%.*]] = shl i64 [[I1]], 1 +; FIXED-NEXT: [[Q2:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET2]] +; FIXED-NEXT: [[X0:%.*]] = load i32, ptr [[Q2]], align 4 ; FIXED-NEXT: [[Y0:%.*]] = add i32 [[X0]], 1 -; FIXED-NEXT: store i32 [[Y0]], ptr [[Q0]], align 4 -; FIXED-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1 +; FIXED-NEXT: store i32 [[Y0]], ptr [[Q2]], align 4 +; FIXED-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET2]], 1 ; FIXED-NEXT: [[Q1:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET1]] ; FIXED-NEXT: [[X1:%.*]] = load i32, ptr [[Q1]], align 4 ; FIXED-NEXT: [[Y1:%.*]] = add i32 [[X1]], 2 ; FIXED-NEXT: store i32 [[Y1]], ptr [[Q1]], align 4 -; FIXED-NEXT: [[NEXTI]] = add i64 [[I]], 1 -; FIXED-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; FIXED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP3:![0-9]+]] +; FIXED-NEXT: [[NEXTI1]] = add i64 [[I1]], 1 +; FIXED-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024 +; FIXED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP3:![0-9]+]] ; FIXED: exit: ; FIXED-NEXT: ret void ; @@ -107,7 +107,7 @@ define void 
@load_store_factor2_i32(ptr %p) { ; SCALABLE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; SCALABLE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 ; SCALABLE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] -; SCALABLE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; SCALABLE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] ; SCALABLE: vector.ph: ; SCALABLE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; SCALABLE-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4 @@ -115,44 +115,44 @@ define void @load_store_factor2_i32(ptr %p) { ; SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] ; SCALABLE-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() ; SCALABLE-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 -; SCALABLE-NEXT: br label [[VECTOR_BODY:%.*]] +; SCALABLE-NEXT: br label [[LOOP:%.*]] ; SCALABLE: vector.body: -; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; SCALABLE-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 -; SCALABLE-NEXT: [[TMP7:%.*]] = shl i64 [[TMP6]], 1 -; SCALABLE-NEXT: [[TMP8:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[TMP7]] -; SCALABLE-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[TMP8]], align 4 +; SCALABLE-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; SCALABLE-NEXT: [[TMP6:%.*]] = add i64 [[I]], 0 +; SCALABLE-NEXT: [[OFFSET0:%.*]] = shl i64 [[TMP6]], 1 +; SCALABLE-NEXT: [[Q0:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[OFFSET0]] +; SCALABLE-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[Q0]], align 4 ; SCALABLE-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.vector.deinterleave2.nxv8i32( [[WIDE_VEC]]) -; SCALABLE-NEXT: [[TMP10:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 -; SCALABLE-NEXT: [[TMP11:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 -; SCALABLE-NEXT: [[TMP12:%.*]] = add [[TMP10]], splat (i32 1) -; SCALABLE-NEXT: [[TMP15:%.*]] = add [[TMP11]], splat (i32 2) -; SCALABLE-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave2.nxv8i32( [[TMP12]], [[TMP15]]) -; SCALABLE-NEXT: store [[INTERLEAVED_VEC]], ptr [[TMP8]], align 4 -; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] -; SCALABLE-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; SCALABLE-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; SCALABLE-NEXT: [[TMP9:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 +; SCALABLE-NEXT: [[TMP10:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 +; SCALABLE-NEXT: [[TMP11:%.*]] = add [[TMP9]], splat (i32 1) +; SCALABLE-NEXT: [[TMP12:%.*]] = add [[TMP10]], splat (i32 2) +; SCALABLE-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave2.nxv8i32( [[TMP11]], [[TMP12]]) +; SCALABLE-NEXT: store [[INTERLEAVED_VEC]], ptr [[Q0]], align 4 +; SCALABLE-NEXT: [[NEXTI]] = add nuw i64 [[I]], [[TMP5]] +; SCALABLE-NEXT: [[TMP13:%.*]] = icmp eq i64 [[NEXTI]], [[N_VEC]] +; SCALABLE-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP0:![0-9]+]] ; SCALABLE: middle.block: ; SCALABLE-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; SCALABLE-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; SCALABLE: scalar.ph: -; SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; SCALABLE-NEXT: br label [[LOOP:%.*]] +; SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] +; SCALABLE-NEXT: br label [[LOOP1:%.*]] ; SCALABLE: loop: -; 
SCALABLE-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] -; SCALABLE-NEXT: [[OFFSET0:%.*]] = shl i64 [[I]], 1 -; SCALABLE-NEXT: [[Q0:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET0]] -; SCALABLE-NEXT: [[X0:%.*]] = load i32, ptr [[Q0]], align 4 +; SCALABLE-NEXT: [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ] +; SCALABLE-NEXT: [[OFFSET2:%.*]] = shl i64 [[I1]], 1 +; SCALABLE-NEXT: [[Q2:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET2]] +; SCALABLE-NEXT: [[X0:%.*]] = load i32, ptr [[Q2]], align 4 ; SCALABLE-NEXT: [[Y0:%.*]] = add i32 [[X0]], 1 -; SCALABLE-NEXT: store i32 [[Y0]], ptr [[Q0]], align 4 -; SCALABLE-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1 +; SCALABLE-NEXT: store i32 [[Y0]], ptr [[Q2]], align 4 +; SCALABLE-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET2]], 1 ; SCALABLE-NEXT: [[Q1:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET1]] ; SCALABLE-NEXT: [[X1:%.*]] = load i32, ptr [[Q1]], align 4 ; SCALABLE-NEXT: [[Y1:%.*]] = add i32 [[X1]], 2 ; SCALABLE-NEXT: store i32 [[Y1]], ptr [[Q1]], align 4 -; SCALABLE-NEXT: [[NEXTI]] = add i64 [[I]], 1 -; SCALABLE-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; SCALABLE-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP3:![0-9]+]] +; SCALABLE-NEXT: [[NEXTI1]] = add i64 [[I1]], 1 +; SCALABLE-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024 +; SCALABLE-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP3:![0-9]+]] ; SCALABLE: exit: ; SCALABLE-NEXT: ret void ; @@ -186,7 +186,7 @@ define void @load_store_factor2_i64(ptr %p) { ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 @@ -194,88 +194,88 @@ define void @load_store_factor2_i64(ptr %p) { ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] ; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 2 -; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP7:%.*]] = shl i64 [[TMP6]], 1 -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP7]] -; CHECK-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[TMP8]], align 8 +; CHECK-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[I]], 0 +; CHECK-NEXT: [[OFFSET0:%.*]] = shl i64 [[TMP6]], 1 +; CHECK-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[OFFSET0]] +; CHECK-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[Q0]], align 8 ; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.vector.deinterleave2.nxv4i64( [[WIDE_VEC]]) -; CHECK-NEXT: [[TMP10:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 -; CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 -; CHECK-NEXT: [[TMP12:%.*]] = add [[TMP10]], splat (i64 1) -; CHECK-NEXT: [[TMP15:%.*]] = add [[TMP11]], splat (i64 2) -; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave2.nxv4i64( [[TMP12]], 
[[TMP15]]) -; CHECK-NEXT: store [[INTERLEAVED_VEC]], ptr [[TMP8]], align 8 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] -; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 +; CHECK-NEXT: [[TMP10:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 +; CHECK-NEXT: [[TMP11:%.*]] = add [[TMP9]], splat (i64 1) +; CHECK-NEXT: [[TMP12:%.*]] = add [[TMP10]], splat (i64 2) +; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave2.nxv4i64( [[TMP11]], [[TMP12]]) +; CHECK-NEXT: store [[INTERLEAVED_VEC]], ptr [[Q0]], align 8 +; CHECK-NEXT: [[NEXTI]] = add nuw i64 [[I]], [[TMP5]] +; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[NEXTI]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] +; CHECK-NEXT: br label [[LOOP1:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[OFFSET0:%.*]] = shl i64 [[I]], 1 -; CHECK-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]] -; CHECK-NEXT: [[X0:%.*]] = load i64, ptr [[Q0]], align 8 +; CHECK-NEXT: [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ] +; CHECK-NEXT: [[OFFSET2:%.*]] = shl i64 [[I1]], 1 +; CHECK-NEXT: [[Q2:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET2]] +; CHECK-NEXT: [[X0:%.*]] = load i64, ptr [[Q2]], align 8 ; CHECK-NEXT: [[Y0:%.*]] = add i64 [[X0]], 1 -; CHECK-NEXT: store i64 [[Y0]], ptr [[Q0]], align 8 -; CHECK-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1 +; CHECK-NEXT: store i64 [[Y0]], ptr [[Q2]], align 8 +; CHECK-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET2]], 1 ; CHECK-NEXT: [[Q1:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET1]] ; CHECK-NEXT: [[X1:%.*]] = load i64, ptr [[Q1]], align 8 ; CHECK-NEXT: [[Y1:%.*]] = add i64 [[X1]], 2 ; CHECK-NEXT: store i64 [[Y1]], ptr [[Q1]], align 8 -; CHECK-NEXT: [[NEXTI]] = add i64 [[I]], 1 -; CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; CHECK-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK-NEXT: [[NEXTI1]] = add i64 [[I1]], 1 +; CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024 +; CHECK-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void ; ; FIXED-LABEL: @load_store_factor2_i64( ; FIXED-NEXT: entry: -; FIXED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; FIXED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] ; FIXED: vector.ph: -; FIXED-NEXT: br label [[VECTOR_BODY:%.*]] +; FIXED-NEXT: br label [[LOOP:%.*]] ; FIXED: vector.body: -; FIXED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; FIXED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; FIXED-NEXT: [[TMP1:%.*]] = shl i64 [[TMP0]], 1 -; FIXED-NEXT: [[TMP2:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP1]] -; FIXED-NEXT: 
[[WIDE_VEC:%.*]] = load <8 x i64>, ptr [[TMP2]], align 8 +; FIXED-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; FIXED-NEXT: [[TMP0:%.*]] = add i64 [[I]], 0 +; FIXED-NEXT: [[OFFSET0:%.*]] = shl i64 [[TMP0]], 1 +; FIXED-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[OFFSET0]] +; FIXED-NEXT: [[WIDE_VEC:%.*]] = load <8 x i64>, ptr [[Q0]], align 8 ; FIXED-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32> ; FIXED-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32> -; FIXED-NEXT: [[TMP4:%.*]] = add <4 x i64> [[STRIDED_VEC]], splat (i64 1) -; FIXED-NEXT: [[TMP7:%.*]] = add <4 x i64> [[STRIDED_VEC1]], splat (i64 2) -; FIXED-NEXT: [[TMP9:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> [[TMP7]], <8 x i32> -; FIXED-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i64> [[TMP9]], <8 x i64> poison, <8 x i32> -; FIXED-NEXT: store <8 x i64> [[INTERLEAVED_VEC]], ptr [[TMP2]], align 8 -; FIXED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; FIXED-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; FIXED-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; FIXED-NEXT: [[TMP3:%.*]] = add <4 x i64> [[STRIDED_VEC]], splat (i64 1) +; FIXED-NEXT: [[TMP4:%.*]] = add <4 x i64> [[STRIDED_VEC1]], splat (i64 2) +; FIXED-NEXT: [[TMP5:%.*]] = shufflevector <4 x i64> [[TMP3]], <4 x i64> [[TMP4]], <8 x i32> +; FIXED-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i64> [[TMP5]], <8 x i64> poison, <8 x i32> +; FIXED-NEXT: store <8 x i64> [[INTERLEAVED_VEC]], ptr [[Q0]], align 8 +; FIXED-NEXT: [[NEXTI]] = add nuw i64 [[I]], 4 +; FIXED-NEXT: [[TMP6:%.*]] = icmp eq i64 [[NEXTI]], 1024 +; FIXED-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP4:![0-9]+]] ; FIXED: middle.block: ; FIXED-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] ; FIXED: scalar.ph: -; FIXED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; FIXED-NEXT: br label [[LOOP:%.*]] +; FIXED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] +; FIXED-NEXT: br label [[LOOP1:%.*]] ; FIXED: loop: -; FIXED-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] -; FIXED-NEXT: [[OFFSET0:%.*]] = shl i64 [[I]], 1 -; FIXED-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]] -; FIXED-NEXT: [[X0:%.*]] = load i64, ptr [[Q0]], align 8 +; FIXED-NEXT: [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ] +; FIXED-NEXT: [[OFFSET2:%.*]] = shl i64 [[I1]], 1 +; FIXED-NEXT: [[Q2:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET2]] +; FIXED-NEXT: [[X0:%.*]] = load i64, ptr [[Q2]], align 8 ; FIXED-NEXT: [[Y0:%.*]] = add i64 [[X0]], 1 -; FIXED-NEXT: store i64 [[Y0]], ptr [[Q0]], align 8 -; FIXED-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1 +; FIXED-NEXT: store i64 [[Y0]], ptr [[Q2]], align 8 +; FIXED-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET2]], 1 ; FIXED-NEXT: [[Q1:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET1]] ; FIXED-NEXT: [[X1:%.*]] = load i64, ptr [[Q1]], align 8 ; FIXED-NEXT: [[Y1:%.*]] = add i64 [[X1]], 2 ; FIXED-NEXT: store i64 [[Y1]], ptr [[Q1]], align 8 -; FIXED-NEXT: [[NEXTI]] = add i64 [[I]], 1 -; FIXED-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; FIXED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP5:![0-9]+]] +; FIXED-NEXT: [[NEXTI1]] = add 
i64 [[I1]], 1 +; FIXED-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024 +; FIXED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP5:![0-9]+]] ; FIXED: exit: ; FIXED-NEXT: ret void ; @@ -284,7 +284,7 @@ define void @load_store_factor2_i64(ptr %p) { ; SCALABLE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; SCALABLE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 ; SCALABLE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] -; SCALABLE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; SCALABLE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] ; SCALABLE: vector.ph: ; SCALABLE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; SCALABLE-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 @@ -292,44 +292,44 @@ define void @load_store_factor2_i64(ptr %p) { ; SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] ; SCALABLE-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() ; SCALABLE-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 2 -; SCALABLE-NEXT: br label [[VECTOR_BODY:%.*]] +; SCALABLE-NEXT: br label [[LOOP:%.*]] ; SCALABLE: vector.body: -; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; SCALABLE-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 -; SCALABLE-NEXT: [[TMP7:%.*]] = shl i64 [[TMP6]], 1 -; SCALABLE-NEXT: [[TMP8:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP7]] -; SCALABLE-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[TMP8]], align 8 +; SCALABLE-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; SCALABLE-NEXT: [[TMP6:%.*]] = add i64 [[I]], 0 +; SCALABLE-NEXT: [[OFFSET0:%.*]] = shl i64 [[TMP6]], 1 +; SCALABLE-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[OFFSET0]] +; SCALABLE-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[Q0]], align 8 ; SCALABLE-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.vector.deinterleave2.nxv4i64( [[WIDE_VEC]]) -; SCALABLE-NEXT: [[TMP10:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 -; SCALABLE-NEXT: [[TMP11:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 -; SCALABLE-NEXT: [[TMP12:%.*]] = add [[TMP10]], splat (i64 1) -; SCALABLE-NEXT: [[TMP15:%.*]] = add [[TMP11]], splat (i64 2) -; SCALABLE-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave2.nxv4i64( [[TMP12]], [[TMP15]]) -; SCALABLE-NEXT: store [[INTERLEAVED_VEC]], ptr [[TMP8]], align 8 -; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] -; SCALABLE-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; SCALABLE-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; SCALABLE-NEXT: [[TMP9:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 +; SCALABLE-NEXT: [[TMP10:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 +; SCALABLE-NEXT: [[TMP11:%.*]] = add [[TMP9]], splat (i64 1) +; SCALABLE-NEXT: [[TMP12:%.*]] = add [[TMP10]], splat (i64 2) +; SCALABLE-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave2.nxv4i64( [[TMP11]], [[TMP12]]) +; SCALABLE-NEXT: store [[INTERLEAVED_VEC]], ptr [[Q0]], align 8 +; SCALABLE-NEXT: [[NEXTI]] = add nuw i64 [[I]], [[TMP5]] +; SCALABLE-NEXT: [[TMP13:%.*]] = icmp eq i64 [[NEXTI]], [[N_VEC]] +; SCALABLE-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP4:![0-9]+]] ; SCALABLE: middle.block: ; SCALABLE-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; SCALABLE-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; SCALABLE: scalar.ph: -; SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 
0, [[ENTRY:%.*]] ] -; SCALABLE-NEXT: br label [[LOOP:%.*]] +; SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] +; SCALABLE-NEXT: br label [[LOOP1:%.*]] ; SCALABLE: loop: -; SCALABLE-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] -; SCALABLE-NEXT: [[OFFSET0:%.*]] = shl i64 [[I]], 1 -; SCALABLE-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]] -; SCALABLE-NEXT: [[X0:%.*]] = load i64, ptr [[Q0]], align 8 +; SCALABLE-NEXT: [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ] +; SCALABLE-NEXT: [[OFFSET2:%.*]] = shl i64 [[I1]], 1 +; SCALABLE-NEXT: [[Q2:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET2]] +; SCALABLE-NEXT: [[X0:%.*]] = load i64, ptr [[Q2]], align 8 ; SCALABLE-NEXT: [[Y0:%.*]] = add i64 [[X0]], 1 -; SCALABLE-NEXT: store i64 [[Y0]], ptr [[Q0]], align 8 -; SCALABLE-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1 +; SCALABLE-NEXT: store i64 [[Y0]], ptr [[Q2]], align 8 +; SCALABLE-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET2]], 1 ; SCALABLE-NEXT: [[Q1:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET1]] ; SCALABLE-NEXT: [[X1:%.*]] = load i64, ptr [[Q1]], align 8 ; SCALABLE-NEXT: [[Y1:%.*]] = add i64 [[X1]], 2 ; SCALABLE-NEXT: store i64 [[Y1]], ptr [[Q1]], align 8 -; SCALABLE-NEXT: [[NEXTI]] = add i64 [[I]], 1 -; SCALABLE-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; SCALABLE-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP5:![0-9]+]] +; SCALABLE-NEXT: [[NEXTI1]] = add i64 [[I1]], 1 +; SCALABLE-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024 +; SCALABLE-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP5:![0-9]+]] ; SCALABLE: exit: ; SCALABLE-NEXT: ret void ; @@ -360,42 +360,42 @@ exit: define void @load_store_factor3_i32(ptr %p) { ; CHECK-LABEL: @load_store_factor3_i32( ; CHECK-NEXT: entry: -; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 3 -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[TMP1]] -; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <24 x i32>, ptr [[TMP2]], align 4 +; CHECK-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[I]], 0 +; CHECK-NEXT: [[OFFSET0:%.*]] = mul i64 [[TMP0]], 3 +; CHECK-NEXT: [[Q0:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[OFFSET0]] +; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <24 x i32>, ptr [[Q0]], align 4 ; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <24 x i32> [[WIDE_VEC]], <24 x i32> poison, <8 x i32> ; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <24 x i32> [[WIDE_VEC]], <24 x i32> poison, <8 x i32> ; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <24 x i32> [[WIDE_VEC]], <24 x i32> poison, <8 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = add <8 x i32> [[STRIDED_VEC]], splat (i32 1) -; CHECK-NEXT: [[TMP6:%.*]] = add <8 x i32> [[STRIDED_VEC1]], splat (i32 2) -; CHECK-NEXT: [[TMP9:%.*]] = add <8 x i32> [[STRIDED_VEC2]], splat (i32 3) -; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> [[TMP6]], <16 x i32> -; CHECK-NEXT: [[TMP12:%.*]] = 
shufflevector <8 x i32> [[TMP9]], <8 x i32> poison, <16 x i32> -; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <16 x i32> [[TMP11]], <16 x i32> [[TMP12]], <24 x i32> -; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <24 x i32> [[TMP13]], <24 x i32> poison, <24 x i32> -; CHECK-NEXT: store <24 x i32> [[INTERLEAVED_VEC]], ptr [[TMP2]], align 4 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 -; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-NEXT: [[TMP3:%.*]] = add <8 x i32> [[STRIDED_VEC]], splat (i32 1) +; CHECK-NEXT: [[TMP4:%.*]] = add <8 x i32> [[STRIDED_VEC1]], splat (i32 2) +; CHECK-NEXT: [[TMP5:%.*]] = add <8 x i32> [[STRIDED_VEC2]], splat (i32 3) +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> [[TMP4]], <16 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <8 x i32> [[TMP5]], <8 x i32> poison, <16 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <16 x i32> [[TMP6]], <16 x i32> [[TMP7]], <24 x i32> +; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <24 x i32> [[TMP8]], <24 x i32> poison, <24 x i32> +; CHECK-NEXT: store <24 x i32> [[INTERLEAVED_VEC]], ptr [[Q0]], align 4 +; CHECK-NEXT: [[NEXTI]] = add nuw i64 [[I]], 8 +; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[NEXTI]], 1024 +; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] +; CHECK-NEXT: br label [[LOOP1:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[OFFSET0:%.*]] = mul i64 [[I]], 3 -; CHECK-NEXT: [[Q0:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET0]] -; CHECK-NEXT: [[X0:%.*]] = load i32, ptr [[Q0]], align 4 +; CHECK-NEXT: [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ] +; CHECK-NEXT: [[OFFSET3:%.*]] = mul i64 [[I1]], 3 +; CHECK-NEXT: [[Q3:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET3]] +; CHECK-NEXT: [[X0:%.*]] = load i32, ptr [[Q3]], align 4 ; CHECK-NEXT: [[Y0:%.*]] = add i32 [[X0]], 1 -; CHECK-NEXT: store i32 [[Y0]], ptr [[Q0]], align 4 -; CHECK-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1 +; CHECK-NEXT: store i32 [[Y0]], ptr [[Q3]], align 4 +; CHECK-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET3]], 1 ; CHECK-NEXT: [[Q1:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET1]] ; CHECK-NEXT: [[X1:%.*]] = load i32, ptr [[Q1]], align 4 ; CHECK-NEXT: [[Y1:%.*]] = add i32 [[X1]], 2 @@ -405,50 +405,50 @@ define void @load_store_factor3_i32(ptr %p) { ; CHECK-NEXT: [[X2:%.*]] = load i32, ptr [[Q2]], align 4 ; CHECK-NEXT: [[Y2:%.*]] = add i32 [[X2]], 3 ; CHECK-NEXT: store i32 [[Y2]], ptr [[Q2]], align 4 -; CHECK-NEXT: [[NEXTI]] = add i64 [[I]], 1 -; CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; CHECK-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK-NEXT: [[NEXTI1]] = add i64 [[I1]], 1 +; CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024 +; CHECK-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP7:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void ; ; FIXED-LABEL: 
@load_store_factor3_i32( ; FIXED-NEXT: entry: -; FIXED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; FIXED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] ; FIXED: vector.ph: -; FIXED-NEXT: br label [[VECTOR_BODY:%.*]] +; FIXED-NEXT: br label [[LOOP:%.*]] ; FIXED: vector.body: -; FIXED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; FIXED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; FIXED-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 3 -; FIXED-NEXT: [[TMP2:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[TMP1]] -; FIXED-NEXT: [[WIDE_VEC:%.*]] = load <24 x i32>, ptr [[TMP2]], align 4 +; FIXED-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; FIXED-NEXT: [[TMP0:%.*]] = add i64 [[I]], 0 +; FIXED-NEXT: [[OFFSET0:%.*]] = mul i64 [[TMP0]], 3 +; FIXED-NEXT: [[Q0:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[OFFSET0]] +; FIXED-NEXT: [[WIDE_VEC:%.*]] = load <24 x i32>, ptr [[Q0]], align 4 ; FIXED-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <24 x i32> [[WIDE_VEC]], <24 x i32> poison, <8 x i32> ; FIXED-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <24 x i32> [[WIDE_VEC]], <24 x i32> poison, <8 x i32> ; FIXED-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <24 x i32> [[WIDE_VEC]], <24 x i32> poison, <8 x i32> -; FIXED-NEXT: [[TMP4:%.*]] = add <8 x i32> [[STRIDED_VEC]], splat (i32 1) -; FIXED-NEXT: [[TMP6:%.*]] = add <8 x i32> [[STRIDED_VEC1]], splat (i32 2) -; FIXED-NEXT: [[TMP9:%.*]] = add <8 x i32> [[STRIDED_VEC2]], splat (i32 3) -; FIXED-NEXT: [[TMP11:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> [[TMP6]], <16 x i32> -; FIXED-NEXT: [[TMP12:%.*]] = shufflevector <8 x i32> [[TMP9]], <8 x i32> poison, <16 x i32> -; FIXED-NEXT: [[TMP13:%.*]] = shufflevector <16 x i32> [[TMP11]], <16 x i32> [[TMP12]], <24 x i32> -; FIXED-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <24 x i32> [[TMP13]], <24 x i32> poison, <24 x i32> -; FIXED-NEXT: store <24 x i32> [[INTERLEAVED_VEC]], ptr [[TMP2]], align 4 -; FIXED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 -; FIXED-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; FIXED-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; FIXED-NEXT: [[TMP3:%.*]] = add <8 x i32> [[STRIDED_VEC]], splat (i32 1) +; FIXED-NEXT: [[TMP4:%.*]] = add <8 x i32> [[STRIDED_VEC1]], splat (i32 2) +; FIXED-NEXT: [[TMP5:%.*]] = add <8 x i32> [[STRIDED_VEC2]], splat (i32 3) +; FIXED-NEXT: [[TMP6:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> [[TMP4]], <16 x i32> +; FIXED-NEXT: [[TMP7:%.*]] = shufflevector <8 x i32> [[TMP5]], <8 x i32> poison, <16 x i32> +; FIXED-NEXT: [[TMP8:%.*]] = shufflevector <16 x i32> [[TMP6]], <16 x i32> [[TMP7]], <24 x i32> +; FIXED-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <24 x i32> [[TMP8]], <24 x i32> poison, <24 x i32> +; FIXED-NEXT: store <24 x i32> [[INTERLEAVED_VEC]], ptr [[Q0]], align 4 +; FIXED-NEXT: [[NEXTI]] = add nuw i64 [[I]], 8 +; FIXED-NEXT: [[TMP9:%.*]] = icmp eq i64 [[NEXTI]], 1024 +; FIXED-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP6:![0-9]+]] ; FIXED: middle.block: ; FIXED-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] ; FIXED: scalar.ph: -; FIXED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; FIXED-NEXT: br label [[LOOP:%.*]] +; FIXED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] +; FIXED-NEXT: br label [[LOOP1:%.*]] ; FIXED: loop: -; FIXED-NEXT: 
[[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] -; FIXED-NEXT: [[OFFSET0:%.*]] = mul i64 [[I]], 3 -; FIXED-NEXT: [[Q0:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET0]] -; FIXED-NEXT: [[X0:%.*]] = load i32, ptr [[Q0]], align 4 +; FIXED-NEXT: [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ] +; FIXED-NEXT: [[OFFSET3:%.*]] = mul i64 [[I1]], 3 +; FIXED-NEXT: [[Q3:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET3]] +; FIXED-NEXT: [[X0:%.*]] = load i32, ptr [[Q3]], align 4 ; FIXED-NEXT: [[Y0:%.*]] = add i32 [[X0]], 1 -; FIXED-NEXT: store i32 [[Y0]], ptr [[Q0]], align 4 -; FIXED-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1 +; FIXED-NEXT: store i32 [[Y0]], ptr [[Q3]], align 4 +; FIXED-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET3]], 1 ; FIXED-NEXT: [[Q1:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET1]] ; FIXED-NEXT: [[X1:%.*]] = load i32, ptr [[Q1]], align 4 ; FIXED-NEXT: [[Y1:%.*]] = add i32 [[X1]], 2 @@ -458,50 +458,50 @@ define void @load_store_factor3_i32(ptr %p) { ; FIXED-NEXT: [[X2:%.*]] = load i32, ptr [[Q2]], align 4 ; FIXED-NEXT: [[Y2:%.*]] = add i32 [[X2]], 3 ; FIXED-NEXT: store i32 [[Y2]], ptr [[Q2]], align 4 -; FIXED-NEXT: [[NEXTI]] = add i64 [[I]], 1 -; FIXED-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; FIXED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP7:![0-9]+]] +; FIXED-NEXT: [[NEXTI1]] = add i64 [[I1]], 1 +; FIXED-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024 +; FIXED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP7:![0-9]+]] ; FIXED: exit: ; FIXED-NEXT: ret void ; ; SCALABLE-LABEL: @load_store_factor3_i32( ; SCALABLE-NEXT: entry: -; SCALABLE-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; SCALABLE-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] ; SCALABLE: vector.ph: -; SCALABLE-NEXT: br label [[VECTOR_BODY:%.*]] +; SCALABLE-NEXT: br label [[LOOP:%.*]] ; SCALABLE: vector.body: -; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; SCALABLE-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; SCALABLE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 3 -; SCALABLE-NEXT: [[TMP2:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[TMP1]] -; SCALABLE-NEXT: [[WIDE_VEC:%.*]] = load <24 x i32>, ptr [[TMP2]], align 4 +; SCALABLE-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; SCALABLE-NEXT: [[TMP0:%.*]] = add i64 [[I]], 0 +; SCALABLE-NEXT: [[OFFSET0:%.*]] = mul i64 [[TMP0]], 3 +; SCALABLE-NEXT: [[Q0:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[OFFSET0]] +; SCALABLE-NEXT: [[WIDE_VEC:%.*]] = load <24 x i32>, ptr [[Q0]], align 4 ; SCALABLE-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <24 x i32> [[WIDE_VEC]], <24 x i32> poison, <8 x i32> ; SCALABLE-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <24 x i32> [[WIDE_VEC]], <24 x i32> poison, <8 x i32> ; SCALABLE-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <24 x i32> [[WIDE_VEC]], <24 x i32> poison, <8 x i32> -; SCALABLE-NEXT: [[TMP4:%.*]] = add <8 x i32> [[STRIDED_VEC]], splat (i32 1) -; SCALABLE-NEXT: [[TMP6:%.*]] = add <8 x i32> [[STRIDED_VEC1]], splat (i32 2) -; SCALABLE-NEXT: [[TMP9:%.*]] = add <8 x i32> [[STRIDED_VEC2]], splat (i32 3) -; SCALABLE-NEXT: [[TMP11:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> [[TMP6]], <16 x i32> -; SCALABLE-NEXT: [[TMP12:%.*]] = shufflevector <8 x i32> [[TMP9]], <8 x i32> poison, <16 x i32> -; SCALABLE-NEXT: [[TMP13:%.*]] = shufflevector <16 x i32> [[TMP11]], <16 x i32> 
[[TMP12]], <24 x i32> -; SCALABLE-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <24 x i32> [[TMP13]], <24 x i32> poison, <24 x i32> -; SCALABLE-NEXT: store <24 x i32> [[INTERLEAVED_VEC]], ptr [[TMP2]], align 4 -; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 -; SCALABLE-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; SCALABLE-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; SCALABLE-NEXT: [[TMP3:%.*]] = add <8 x i32> [[STRIDED_VEC]], splat (i32 1) +; SCALABLE-NEXT: [[TMP4:%.*]] = add <8 x i32> [[STRIDED_VEC1]], splat (i32 2) +; SCALABLE-NEXT: [[TMP5:%.*]] = add <8 x i32> [[STRIDED_VEC2]], splat (i32 3) +; SCALABLE-NEXT: [[TMP6:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> [[TMP4]], <16 x i32> +; SCALABLE-NEXT: [[TMP7:%.*]] = shufflevector <8 x i32> [[TMP5]], <8 x i32> poison, <16 x i32> +; SCALABLE-NEXT: [[TMP8:%.*]] = shufflevector <16 x i32> [[TMP6]], <16 x i32> [[TMP7]], <24 x i32> +; SCALABLE-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <24 x i32> [[TMP8]], <24 x i32> poison, <24 x i32> +; SCALABLE-NEXT: store <24 x i32> [[INTERLEAVED_VEC]], ptr [[Q0]], align 4 +; SCALABLE-NEXT: [[NEXTI]] = add nuw i64 [[I]], 8 +; SCALABLE-NEXT: [[TMP9:%.*]] = icmp eq i64 [[NEXTI]], 1024 +; SCALABLE-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP6:![0-9]+]] ; SCALABLE: middle.block: ; SCALABLE-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] ; SCALABLE: scalar.ph: -; SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; SCALABLE-NEXT: br label [[LOOP:%.*]] +; SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] +; SCALABLE-NEXT: br label [[LOOP1:%.*]] ; SCALABLE: loop: -; SCALABLE-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] -; SCALABLE-NEXT: [[OFFSET0:%.*]] = mul i64 [[I]], 3 -; SCALABLE-NEXT: [[Q0:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET0]] -; SCALABLE-NEXT: [[X0:%.*]] = load i32, ptr [[Q0]], align 4 +; SCALABLE-NEXT: [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ] +; SCALABLE-NEXT: [[OFFSET3:%.*]] = mul i64 [[I1]], 3 +; SCALABLE-NEXT: [[Q3:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET3]] +; SCALABLE-NEXT: [[X0:%.*]] = load i32, ptr [[Q3]], align 4 ; SCALABLE-NEXT: [[Y0:%.*]] = add i32 [[X0]], 1 -; SCALABLE-NEXT: store i32 [[Y0]], ptr [[Q0]], align 4 -; SCALABLE-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1 +; SCALABLE-NEXT: store i32 [[Y0]], ptr [[Q3]], align 4 +; SCALABLE-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET3]], 1 ; SCALABLE-NEXT: [[Q1:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET1]] ; SCALABLE-NEXT: [[X1:%.*]] = load i32, ptr [[Q1]], align 4 ; SCALABLE-NEXT: [[Y1:%.*]] = add i32 [[X1]], 2 @@ -511,9 +511,9 @@ define void @load_store_factor3_i32(ptr %p) { ; SCALABLE-NEXT: [[X2:%.*]] = load i32, ptr [[Q2]], align 4 ; SCALABLE-NEXT: [[Y2:%.*]] = add i32 [[X2]], 3 ; SCALABLE-NEXT: store i32 [[Y2]], ptr [[Q2]], align 4 -; SCALABLE-NEXT: [[NEXTI]] = add i64 [[I]], 1 -; SCALABLE-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; SCALABLE-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP7:![0-9]+]] +; SCALABLE-NEXT: [[NEXTI1]] = add i64 [[I1]], 1 +; SCALABLE-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024 +; SCALABLE-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP7:![0-9]+]] ; SCALABLE: exit: ; SCALABLE-NEXT: ret void ; @@ -550,42 
+550,42 @@ exit: define void @load_store_factor3_i64(ptr %p) { ; CHECK-LABEL: @load_store_factor3_i64( ; CHECK-NEXT: entry: -; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 3 -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP1]] -; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <12 x i64>, ptr [[TMP2]], align 8 +; CHECK-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[I]], 0 +; CHECK-NEXT: [[OFFSET0:%.*]] = mul i64 [[TMP0]], 3 +; CHECK-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[OFFSET0]] +; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <12 x i64>, ptr [[Q0]], align 8 ; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <12 x i64> [[WIDE_VEC]], <12 x i64> poison, <4 x i32> ; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <12 x i64> [[WIDE_VEC]], <12 x i64> poison, <4 x i32> ; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <12 x i64> [[WIDE_VEC]], <12 x i64> poison, <4 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = add <4 x i64> [[STRIDED_VEC]], splat (i64 1) -; CHECK-NEXT: [[TMP6:%.*]] = add <4 x i64> [[STRIDED_VEC1]], splat (i64 2) -; CHECK-NEXT: [[TMP9:%.*]] = add <4 x i64> [[STRIDED_VEC2]], splat (i64 3) -; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> [[TMP6]], <8 x i32> -; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <4 x i64> [[TMP9]], <4 x i64> poison, <8 x i32> -; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <8 x i64> [[TMP11]], <8 x i64> [[TMP12]], <12 x i32> -; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <12 x i64> [[TMP13]], <12 x i64> poison, <12 x i32> -; CHECK-NEXT: store <12 x i64> [[INTERLEAVED_VEC]], ptr [[TMP2]], align 8 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-NEXT: [[TMP3:%.*]] = add <4 x i64> [[STRIDED_VEC]], splat (i64 1) +; CHECK-NEXT: [[TMP4:%.*]] = add <4 x i64> [[STRIDED_VEC1]], splat (i64 2) +; CHECK-NEXT: [[TMP5:%.*]] = add <4 x i64> [[STRIDED_VEC2]], splat (i64 3) +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i64> [[TMP3]], <4 x i64> [[TMP4]], <8 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i64> [[TMP5]], <4 x i64> poison, <8 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <8 x i64> [[TMP6]], <8 x i64> [[TMP7]], <12 x i32> +; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <12 x i64> [[TMP8]], <12 x i64> poison, <12 x i32> +; CHECK-NEXT: store <12 x i64> [[INTERLEAVED_VEC]], ptr [[Q0]], align 8 +; CHECK-NEXT: [[NEXTI]] = add nuw i64 [[I]], 4 +; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[NEXTI]], 1024 +; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] 
] +; CHECK-NEXT: br label [[LOOP1:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[OFFSET0:%.*]] = mul i64 [[I]], 3 -; CHECK-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]] -; CHECK-NEXT: [[X0:%.*]] = load i64, ptr [[Q0]], align 8 +; CHECK-NEXT: [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ] +; CHECK-NEXT: [[OFFSET3:%.*]] = mul i64 [[I1]], 3 +; CHECK-NEXT: [[Q3:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET3]] +; CHECK-NEXT: [[X0:%.*]] = load i64, ptr [[Q3]], align 8 ; CHECK-NEXT: [[Y0:%.*]] = add i64 [[X0]], 1 -; CHECK-NEXT: store i64 [[Y0]], ptr [[Q0]], align 8 -; CHECK-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1 +; CHECK-NEXT: store i64 [[Y0]], ptr [[Q3]], align 8 +; CHECK-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET3]], 1 ; CHECK-NEXT: [[Q1:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET1]] ; CHECK-NEXT: [[X1:%.*]] = load i64, ptr [[Q1]], align 8 ; CHECK-NEXT: [[Y1:%.*]] = add i64 [[X1]], 2 @@ -595,50 +595,50 @@ define void @load_store_factor3_i64(ptr %p) { ; CHECK-NEXT: [[X2:%.*]] = load i64, ptr [[Q2]], align 8 ; CHECK-NEXT: [[Y2:%.*]] = add i64 [[X2]], 3 ; CHECK-NEXT: store i64 [[Y2]], ptr [[Q2]], align 8 -; CHECK-NEXT: [[NEXTI]] = add i64 [[I]], 1 -; CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; CHECK-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK-NEXT: [[NEXTI1]] = add i64 [[I1]], 1 +; CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024 +; CHECK-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP9:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void ; ; FIXED-LABEL: @load_store_factor3_i64( ; FIXED-NEXT: entry: -; FIXED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; FIXED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] ; FIXED: vector.ph: -; FIXED-NEXT: br label [[VECTOR_BODY:%.*]] +; FIXED-NEXT: br label [[LOOP:%.*]] ; FIXED: vector.body: -; FIXED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; FIXED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; FIXED-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 3 -; FIXED-NEXT: [[TMP2:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP1]] -; FIXED-NEXT: [[WIDE_VEC:%.*]] = load <12 x i64>, ptr [[TMP2]], align 8 +; FIXED-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; FIXED-NEXT: [[TMP0:%.*]] = add i64 [[I]], 0 +; FIXED-NEXT: [[OFFSET0:%.*]] = mul i64 [[TMP0]], 3 +; FIXED-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[OFFSET0]] +; FIXED-NEXT: [[WIDE_VEC:%.*]] = load <12 x i64>, ptr [[Q0]], align 8 ; FIXED-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <12 x i64> [[WIDE_VEC]], <12 x i64> poison, <4 x i32> ; FIXED-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <12 x i64> [[WIDE_VEC]], <12 x i64> poison, <4 x i32> ; FIXED-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <12 x i64> [[WIDE_VEC]], <12 x i64> poison, <4 x i32> -; FIXED-NEXT: [[TMP4:%.*]] = add <4 x i64> [[STRIDED_VEC]], splat (i64 1) -; FIXED-NEXT: [[TMP6:%.*]] = add <4 x i64> [[STRIDED_VEC1]], splat (i64 2) -; FIXED-NEXT: [[TMP9:%.*]] = add <4 x i64> [[STRIDED_VEC2]], splat (i64 3) -; FIXED-NEXT: [[TMP11:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> [[TMP6]], <8 x i32> -; FIXED-NEXT: [[TMP12:%.*]] = shufflevector <4 x i64> [[TMP9]], <4 x i64> poison, <8 x i32> -; FIXED-NEXT: [[TMP13:%.*]] = shufflevector <8 x i64> [[TMP11]], <8 x i64> [[TMP12]], <12 x 
i32> -; FIXED-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <12 x i64> [[TMP13]], <12 x i64> poison, <12 x i32> -; FIXED-NEXT: store <12 x i64> [[INTERLEAVED_VEC]], ptr [[TMP2]], align 8 -; FIXED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; FIXED-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; FIXED-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; FIXED-NEXT: [[TMP3:%.*]] = add <4 x i64> [[STRIDED_VEC]], splat (i64 1) +; FIXED-NEXT: [[TMP4:%.*]] = add <4 x i64> [[STRIDED_VEC1]], splat (i64 2) +; FIXED-NEXT: [[TMP5:%.*]] = add <4 x i64> [[STRIDED_VEC2]], splat (i64 3) +; FIXED-NEXT: [[TMP6:%.*]] = shufflevector <4 x i64> [[TMP3]], <4 x i64> [[TMP4]], <8 x i32> +; FIXED-NEXT: [[TMP7:%.*]] = shufflevector <4 x i64> [[TMP5]], <4 x i64> poison, <8 x i32> +; FIXED-NEXT: [[TMP8:%.*]] = shufflevector <8 x i64> [[TMP6]], <8 x i64> [[TMP7]], <12 x i32> +; FIXED-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <12 x i64> [[TMP8]], <12 x i64> poison, <12 x i32> +; FIXED-NEXT: store <12 x i64> [[INTERLEAVED_VEC]], ptr [[Q0]], align 8 +; FIXED-NEXT: [[NEXTI]] = add nuw i64 [[I]], 4 +; FIXED-NEXT: [[TMP9:%.*]] = icmp eq i64 [[NEXTI]], 1024 +; FIXED-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP8:![0-9]+]] ; FIXED: middle.block: ; FIXED-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] ; FIXED: scalar.ph: -; FIXED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; FIXED-NEXT: br label [[LOOP:%.*]] +; FIXED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] +; FIXED-NEXT: br label [[LOOP1:%.*]] ; FIXED: loop: -; FIXED-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] -; FIXED-NEXT: [[OFFSET0:%.*]] = mul i64 [[I]], 3 -; FIXED-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]] -; FIXED-NEXT: [[X0:%.*]] = load i64, ptr [[Q0]], align 8 +; FIXED-NEXT: [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ] +; FIXED-NEXT: [[OFFSET3:%.*]] = mul i64 [[I1]], 3 +; FIXED-NEXT: [[Q3:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET3]] +; FIXED-NEXT: [[X0:%.*]] = load i64, ptr [[Q3]], align 8 ; FIXED-NEXT: [[Y0:%.*]] = add i64 [[X0]], 1 -; FIXED-NEXT: store i64 [[Y0]], ptr [[Q0]], align 8 -; FIXED-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1 +; FIXED-NEXT: store i64 [[Y0]], ptr [[Q3]], align 8 +; FIXED-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET3]], 1 ; FIXED-NEXT: [[Q1:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET1]] ; FIXED-NEXT: [[X1:%.*]] = load i64, ptr [[Q1]], align 8 ; FIXED-NEXT: [[Y1:%.*]] = add i64 [[X1]], 2 @@ -648,50 +648,50 @@ define void @load_store_factor3_i64(ptr %p) { ; FIXED-NEXT: [[X2:%.*]] = load i64, ptr [[Q2]], align 8 ; FIXED-NEXT: [[Y2:%.*]] = add i64 [[X2]], 3 ; FIXED-NEXT: store i64 [[Y2]], ptr [[Q2]], align 8 -; FIXED-NEXT: [[NEXTI]] = add i64 [[I]], 1 -; FIXED-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; FIXED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP9:![0-9]+]] +; FIXED-NEXT: [[NEXTI1]] = add i64 [[I1]], 1 +; FIXED-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024 +; FIXED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP9:![0-9]+]] ; FIXED: exit: ; FIXED-NEXT: ret void ; ; SCALABLE-LABEL: @load_store_factor3_i64( ; SCALABLE-NEXT: entry: -; SCALABLE-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; SCALABLE-NEXT: br i1 false, label 
[[SCALAR_PH:%.*]], label [[ENTRY:%.*]] ; SCALABLE: vector.ph: -; SCALABLE-NEXT: br label [[VECTOR_BODY:%.*]] +; SCALABLE-NEXT: br label [[LOOP:%.*]] ; SCALABLE: vector.body: -; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; SCALABLE-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; SCALABLE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 3 -; SCALABLE-NEXT: [[TMP2:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP1]] -; SCALABLE-NEXT: [[WIDE_VEC:%.*]] = load <12 x i64>, ptr [[TMP2]], align 8 +; SCALABLE-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; SCALABLE-NEXT: [[TMP0:%.*]] = add i64 [[I]], 0 +; SCALABLE-NEXT: [[OFFSET0:%.*]] = mul i64 [[TMP0]], 3 +; SCALABLE-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[OFFSET0]] +; SCALABLE-NEXT: [[WIDE_VEC:%.*]] = load <12 x i64>, ptr [[Q0]], align 8 ; SCALABLE-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <12 x i64> [[WIDE_VEC]], <12 x i64> poison, <4 x i32> ; SCALABLE-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <12 x i64> [[WIDE_VEC]], <12 x i64> poison, <4 x i32> ; SCALABLE-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <12 x i64> [[WIDE_VEC]], <12 x i64> poison, <4 x i32> -; SCALABLE-NEXT: [[TMP4:%.*]] = add <4 x i64> [[STRIDED_VEC]], splat (i64 1) -; SCALABLE-NEXT: [[TMP6:%.*]] = add <4 x i64> [[STRIDED_VEC1]], splat (i64 2) -; SCALABLE-NEXT: [[TMP9:%.*]] = add <4 x i64> [[STRIDED_VEC2]], splat (i64 3) -; SCALABLE-NEXT: [[TMP11:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> [[TMP6]], <8 x i32> -; SCALABLE-NEXT: [[TMP12:%.*]] = shufflevector <4 x i64> [[TMP9]], <4 x i64> poison, <8 x i32> -; SCALABLE-NEXT: [[TMP13:%.*]] = shufflevector <8 x i64> [[TMP11]], <8 x i64> [[TMP12]], <12 x i32> -; SCALABLE-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <12 x i64> [[TMP13]], <12 x i64> poison, <12 x i32> -; SCALABLE-NEXT: store <12 x i64> [[INTERLEAVED_VEC]], ptr [[TMP2]], align 8 -; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; SCALABLE-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; SCALABLE-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; SCALABLE-NEXT: [[TMP3:%.*]] = add <4 x i64> [[STRIDED_VEC]], splat (i64 1) +; SCALABLE-NEXT: [[TMP4:%.*]] = add <4 x i64> [[STRIDED_VEC1]], splat (i64 2) +; SCALABLE-NEXT: [[TMP5:%.*]] = add <4 x i64> [[STRIDED_VEC2]], splat (i64 3) +; SCALABLE-NEXT: [[TMP6:%.*]] = shufflevector <4 x i64> [[TMP3]], <4 x i64> [[TMP4]], <8 x i32> +; SCALABLE-NEXT: [[TMP7:%.*]] = shufflevector <4 x i64> [[TMP5]], <4 x i64> poison, <8 x i32> +; SCALABLE-NEXT: [[TMP8:%.*]] = shufflevector <8 x i64> [[TMP6]], <8 x i64> [[TMP7]], <12 x i32> +; SCALABLE-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <12 x i64> [[TMP8]], <12 x i64> poison, <12 x i32> +; SCALABLE-NEXT: store <12 x i64> [[INTERLEAVED_VEC]], ptr [[Q0]], align 8 +; SCALABLE-NEXT: [[NEXTI]] = add nuw i64 [[I]], 4 +; SCALABLE-NEXT: [[TMP9:%.*]] = icmp eq i64 [[NEXTI]], 1024 +; SCALABLE-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP8:![0-9]+]] ; SCALABLE: middle.block: ; SCALABLE-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] ; SCALABLE: scalar.ph: -; SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; SCALABLE-NEXT: br label [[LOOP:%.*]] +; SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] +; SCALABLE-NEXT: br label [[LOOP1:%.*]] ; SCALABLE: loop: -; SCALABLE-NEXT: [[I:%.*]] = phi i64 
[ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] -; SCALABLE-NEXT: [[OFFSET0:%.*]] = mul i64 [[I]], 3 -; SCALABLE-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]] -; SCALABLE-NEXT: [[X0:%.*]] = load i64, ptr [[Q0]], align 8 +; SCALABLE-NEXT: [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ] +; SCALABLE-NEXT: [[OFFSET3:%.*]] = mul i64 [[I1]], 3 +; SCALABLE-NEXT: [[Q3:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET3]] +; SCALABLE-NEXT: [[X0:%.*]] = load i64, ptr [[Q3]], align 8 ; SCALABLE-NEXT: [[Y0:%.*]] = add i64 [[X0]], 1 -; SCALABLE-NEXT: store i64 [[Y0]], ptr [[Q0]], align 8 -; SCALABLE-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1 +; SCALABLE-NEXT: store i64 [[Y0]], ptr [[Q3]], align 8 +; SCALABLE-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET3]], 1 ; SCALABLE-NEXT: [[Q1:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET1]] ; SCALABLE-NEXT: [[X1:%.*]] = load i64, ptr [[Q1]], align 8 ; SCALABLE-NEXT: [[Y1:%.*]] = add i64 [[X1]], 2 @@ -701,9 +701,9 @@ define void @load_store_factor3_i64(ptr %p) { ; SCALABLE-NEXT: [[X2:%.*]] = load i64, ptr [[Q2]], align 8 ; SCALABLE-NEXT: [[Y2:%.*]] = add i64 [[X2]], 3 ; SCALABLE-NEXT: store i64 [[Y2]], ptr [[Q2]], align 8 -; SCALABLE-NEXT: [[NEXTI]] = add i64 [[I]], 1 -; SCALABLE-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; SCALABLE-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP9:![0-9]+]] +; SCALABLE-NEXT: [[NEXTI1]] = add i64 [[I1]], 1 +; SCALABLE-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024 +; SCALABLE-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP9:![0-9]+]] ; SCALABLE: exit: ; SCALABLE-NEXT: ret void ; @@ -740,56 +740,75 @@ exit: define void @load_store_factor8(ptr %p) { ; CHECK-LABEL: @load_store_factor8( ; CHECK-NEXT: entry: -; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP0]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP1]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[TMP0]], 3 -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP1]] -; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <16 x i64>, ptr [[TMP2]], align 8 -; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> -; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> -; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> -; CHECK-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> -; CHECK-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> -; CHECK-NEXT: [[STRIDED_VEC5:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> -; CHECK-NEXT: [[STRIDED_VEC6:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> 
poison, <2 x i32> -; CHECK-NEXT: [[STRIDED_VEC7:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = add <2 x i64> [[STRIDED_VEC]], splat (i64 1) -; CHECK-NEXT: [[TMP6:%.*]] = add <2 x i64> [[STRIDED_VEC1]], splat (i64 2) -; CHECK-NEXT: [[TMP8:%.*]] = add <2 x i64> [[STRIDED_VEC2]], splat (i64 3) -; CHECK-NEXT: [[TMP10:%.*]] = add <2 x i64> [[STRIDED_VEC3]], splat (i64 4) -; CHECK-NEXT: [[TMP12:%.*]] = add <2 x i64> [[STRIDED_VEC4]], splat (i64 5) -; CHECK-NEXT: [[TMP14:%.*]] = add <2 x i64> [[STRIDED_VEC5]], splat (i64 6) -; CHECK-NEXT: [[TMP16:%.*]] = add <2 x i64> [[STRIDED_VEC6]], splat (i64 7) -; CHECK-NEXT: [[TMP19:%.*]] = add <2 x i64> [[STRIDED_VEC7]], splat (i64 8) -; CHECK-NEXT: [[TMP21:%.*]] = shufflevector <2 x i64> [[TMP4]], <2 x i64> [[TMP6]], <4 x i32> -; CHECK-NEXT: [[TMP22:%.*]] = shufflevector <2 x i64> [[TMP8]], <2 x i64> [[TMP10]], <4 x i32> -; CHECK-NEXT: [[TMP23:%.*]] = shufflevector <2 x i64> [[TMP12]], <2 x i64> [[TMP14]], <4 x i32> -; CHECK-NEXT: [[TMP24:%.*]] = shufflevector <2 x i64> [[TMP16]], <2 x i64> [[TMP19]], <4 x i32> -; CHECK-NEXT: [[TMP25:%.*]] = shufflevector <4 x i64> [[TMP21]], <4 x i64> [[TMP22]], <8 x i32> -; CHECK-NEXT: [[TMP26:%.*]] = shufflevector <4 x i64> [[TMP23]], <4 x i64> [[TMP24]], <8 x i32> -; CHECK-NEXT: [[TMP27:%.*]] = shufflevector <8 x i64> [[TMP25]], <8 x i64> [[TMP26]], <16 x i32> -; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <16 x i64> [[TMP27]], <16 x i64> poison, <16 x i32> -; CHECK-NEXT: store <16 x i64> [[INTERLEAVED_VEC]], ptr [[TMP2]], align 8 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 -; CHECK-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; CHECK-NEXT: br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[I]], 0 +; CHECK-NEXT: [[OFFSET0:%.*]] = shl i64 [[TMP3]], 3 +; CHECK-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[OFFSET0]] +; CHECK-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[Q0]], align 8 +; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.vector.deinterleave2.nxv8i64( [[WIDE_VEC]]) +; CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 +; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = call { , } @llvm.vector.deinterleave2.nxv4i64( [[TMP6]]) +; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = call { , } @llvm.vector.deinterleave2.nxv4i64( [[TMP7]]) +; CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , } [[STRIDED_VEC1]], 0 +; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , } [[STRIDED_VEC2]], 0 +; CHECK-NEXT: [[TMP10:%.*]] = extractvalue { , } [[STRIDED_VEC1]], 1 +; CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , } [[STRIDED_VEC2]], 1 +; CHECK-NEXT: [[STRIDED_VEC3:%.*]] = call { , } @llvm.vector.deinterleave2.nxv2i64( [[TMP8]]) +; CHECK-NEXT: [[STRIDED_VEC4:%.*]] = call { , } @llvm.vector.deinterleave2.nxv2i64( [[TMP9]]) +; CHECK-NEXT: [[STRIDED_VEC5:%.*]] = call { , } @llvm.vector.deinterleave2.nxv2i64( [[TMP10]]) +; CHECK-NEXT: [[STRIDED_VEC6:%.*]] = call { , } @llvm.vector.deinterleave2.nxv2i64( [[TMP11]]) +; CHECK-NEXT: [[TMP12:%.*]] = extractvalue { , } [[STRIDED_VEC3]], 0 +; CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , } [[STRIDED_VEC4]], 0 +; CHECK-NEXT: [[TMP14:%.*]] = extractvalue { , } [[STRIDED_VEC5]], 0 +; CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , } [[STRIDED_VEC6]], 0 +; CHECK-NEXT: [[TMP16:%.*]] = 
extractvalue { , } [[STRIDED_VEC3]], 1 +; CHECK-NEXT: [[TMP17:%.*]] = extractvalue { , } [[STRIDED_VEC4]], 1 +; CHECK-NEXT: [[TMP18:%.*]] = extractvalue { , } [[STRIDED_VEC5]], 1 +; CHECK-NEXT: [[TMP19:%.*]] = extractvalue { , } [[STRIDED_VEC6]], 1 +; CHECK-NEXT: [[TMP20:%.*]] = add [[TMP12]], splat (i64 1) +; CHECK-NEXT: [[TMP21:%.*]] = add [[TMP13]], splat (i64 2) +; CHECK-NEXT: [[TMP22:%.*]] = add [[TMP14]], splat (i64 3) +; CHECK-NEXT: [[TMP23:%.*]] = add [[TMP15]], splat (i64 4) +; CHECK-NEXT: [[TMP24:%.*]] = add [[TMP16]], splat (i64 5) +; CHECK-NEXT: [[TMP25:%.*]] = add [[TMP17]], splat (i64 6) +; CHECK-NEXT: [[TMP26:%.*]] = add [[TMP18]], splat (i64 7) +; CHECK-NEXT: [[TMP27:%.*]] = add [[TMP19]], splat (i64 8) +; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave2.nxv2i64( [[TMP20]], [[TMP24]]) +; CHECK-NEXT: [[INTERLEAVED_VEC7:%.*]] = call @llvm.vector.interleave2.nxv2i64( [[TMP21]], [[TMP25]]) +; CHECK-NEXT: [[INTERLEAVED_VEC8:%.*]] = call @llvm.vector.interleave2.nxv2i64( [[TMP22]], [[TMP26]]) +; CHECK-NEXT: [[INTERLEAVED_VEC9:%.*]] = call @llvm.vector.interleave2.nxv2i64( [[TMP23]], [[TMP27]]) +; CHECK-NEXT: [[INTERLEAVED_VEC10:%.*]] = call @llvm.vector.interleave2.nxv4i64( [[INTERLEAVED_VEC]], [[INTERLEAVED_VEC8]]) +; CHECK-NEXT: [[INTERLEAVED_VEC11:%.*]] = call @llvm.vector.interleave2.nxv4i64( [[INTERLEAVED_VEC7]], [[INTERLEAVED_VEC9]]) +; CHECK-NEXT: [[INTERLEAVED_VEC12:%.*]] = call @llvm.vector.interleave2.nxv8i64( [[INTERLEAVED_VEC10]], [[INTERLEAVED_VEC11]]) +; CHECK-NEXT: store [[INTERLEAVED_VEC12]], ptr [[Q0]], align 8 +; CHECK-NEXT: [[NEXTI]] = add nuw i64 [[I]], [[TMP2]] +; CHECK-NEXT: [[TMP28:%.*]] = icmp eq i64 [[NEXTI]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP10:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] +; CHECK-NEXT: br label [[LOOP1:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[OFFSET0:%.*]] = shl i64 [[I]], 3 -; CHECK-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]] -; CHECK-NEXT: [[X0:%.*]] = load i64, ptr [[Q0]], align 8 +; CHECK-NEXT: [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ] +; CHECK-NEXT: [[OFFSET8:%.*]] = shl i64 [[I1]], 3 +; CHECK-NEXT: [[Q8:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET8]] +; CHECK-NEXT: [[X0:%.*]] = load i64, ptr [[Q8]], align 8 ; CHECK-NEXT: [[Y0:%.*]] = add i64 [[X0]], 1 -; CHECK-NEXT: store i64 [[Y0]], ptr [[Q0]], align 8 -; CHECK-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1 +; CHECK-NEXT: store i64 [[Y0]], ptr [[Q8]], align 8 +; CHECK-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET8]], 1 ; CHECK-NEXT: [[Q1:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET1]] ; CHECK-NEXT: [[X1:%.*]] = load i64, ptr [[Q1]], align 8 ; CHECK-NEXT: [[Y1:%.*]] = add i64 [[X1]], 2 @@ -824,23 +843,23 @@ define void @load_store_factor8(ptr %p) { ; CHECK-NEXT: [[X7:%.*]] = load i64, ptr [[Q7]], align 8 ; CHECK-NEXT: [[Y7:%.*]] = add i64 [[X7]], 8 ; CHECK-NEXT: store i64 [[Y7]], ptr [[Q7]], 
align 8 -; CHECK-NEXT: [[NEXTI]] = add i64 [[I]], 1 -; CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; CHECK-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP11:![0-9]+]] +; CHECK-NEXT: [[NEXTI1]] = add i64 [[I1]], 1 +; CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024 +; CHECK-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP11:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void ; ; FIXED-LABEL: @load_store_factor8( ; FIXED-NEXT: entry: -; FIXED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; FIXED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] ; FIXED: vector.ph: -; FIXED-NEXT: br label [[VECTOR_BODY:%.*]] +; FIXED-NEXT: br label [[LOOP:%.*]] ; FIXED: vector.body: -; FIXED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; FIXED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; FIXED-NEXT: [[TMP1:%.*]] = shl i64 [[TMP0]], 3 -; FIXED-NEXT: [[TMP2:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP1]] -; FIXED-NEXT: [[WIDE_VEC:%.*]] = load <16 x i64>, ptr [[TMP2]], align 8 +; FIXED-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; FIXED-NEXT: [[TMP0:%.*]] = add i64 [[I]], 0 +; FIXED-NEXT: [[OFFSET0:%.*]] = shl i64 [[TMP0]], 3 +; FIXED-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[OFFSET0]] +; FIXED-NEXT: [[WIDE_VEC:%.*]] = load <16 x i64>, ptr [[Q0]], align 8 ; FIXED-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> ; FIXED-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> ; FIXED-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> @@ -849,39 +868,39 @@ define void @load_store_factor8(ptr %p) { ; FIXED-NEXT: [[STRIDED_VEC5:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> ; FIXED-NEXT: [[STRIDED_VEC6:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> ; FIXED-NEXT: [[STRIDED_VEC7:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> -; FIXED-NEXT: [[TMP4:%.*]] = add <2 x i64> [[STRIDED_VEC]], splat (i64 1) -; FIXED-NEXT: [[TMP6:%.*]] = add <2 x i64> [[STRIDED_VEC1]], splat (i64 2) -; FIXED-NEXT: [[TMP8:%.*]] = add <2 x i64> [[STRIDED_VEC2]], splat (i64 3) -; FIXED-NEXT: [[TMP10:%.*]] = add <2 x i64> [[STRIDED_VEC3]], splat (i64 4) -; FIXED-NEXT: [[TMP12:%.*]] = add <2 x i64> [[STRIDED_VEC4]], splat (i64 5) -; FIXED-NEXT: [[TMP14:%.*]] = add <2 x i64> [[STRIDED_VEC5]], splat (i64 6) -; FIXED-NEXT: [[TMP16:%.*]] = add <2 x i64> [[STRIDED_VEC6]], splat (i64 7) -; FIXED-NEXT: [[TMP19:%.*]] = add <2 x i64> [[STRIDED_VEC7]], splat (i64 8) -; FIXED-NEXT: [[TMP21:%.*]] = shufflevector <2 x i64> [[TMP4]], <2 x i64> [[TMP6]], <4 x i32> -; FIXED-NEXT: [[TMP22:%.*]] = shufflevector <2 x i64> [[TMP8]], <2 x i64> [[TMP10]], <4 x i32> -; FIXED-NEXT: [[TMP23:%.*]] = shufflevector <2 x i64> [[TMP12]], <2 x i64> [[TMP14]], <4 x i32> -; FIXED-NEXT: [[TMP24:%.*]] = shufflevector <2 x i64> [[TMP16]], <2 x i64> [[TMP19]], <4 x i32> -; FIXED-NEXT: [[TMP25:%.*]] = shufflevector <4 x i64> [[TMP21]], <4 x i64> [[TMP22]], <8 x i32> -; FIXED-NEXT: [[TMP26:%.*]] = shufflevector <4 x i64> [[TMP23]], <4 x i64> [[TMP24]], <8 x i32> -; FIXED-NEXT: [[TMP27:%.*]] = shufflevector <8 x i64> [[TMP25]], <8 x i64> [[TMP26]], <16 x i32> -; FIXED-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <16 x i64> [[TMP27]], <16 x i64> poison, <16 x i32> -; 
FIXED-NEXT: store <16 x i64> [[INTERLEAVED_VEC]], ptr [[TMP2]], align 8 -; FIXED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 -; FIXED-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; FIXED-NEXT: br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; FIXED-NEXT: [[TMP3:%.*]] = add <2 x i64> [[STRIDED_VEC]], splat (i64 1) +; FIXED-NEXT: [[TMP4:%.*]] = add <2 x i64> [[STRIDED_VEC1]], splat (i64 2) +; FIXED-NEXT: [[TMP5:%.*]] = add <2 x i64> [[STRIDED_VEC2]], splat (i64 3) +; FIXED-NEXT: [[TMP6:%.*]] = add <2 x i64> [[STRIDED_VEC3]], splat (i64 4) +; FIXED-NEXT: [[TMP7:%.*]] = add <2 x i64> [[STRIDED_VEC4]], splat (i64 5) +; FIXED-NEXT: [[TMP8:%.*]] = add <2 x i64> [[STRIDED_VEC5]], splat (i64 6) +; FIXED-NEXT: [[TMP9:%.*]] = add <2 x i64> [[STRIDED_VEC6]], splat (i64 7) +; FIXED-NEXT: [[TMP10:%.*]] = add <2 x i64> [[STRIDED_VEC7]], splat (i64 8) +; FIXED-NEXT: [[TMP11:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP4]], <4 x i32> +; FIXED-NEXT: [[TMP12:%.*]] = shufflevector <2 x i64> [[TMP5]], <2 x i64> [[TMP6]], <4 x i32> +; FIXED-NEXT: [[TMP13:%.*]] = shufflevector <2 x i64> [[TMP7]], <2 x i64> [[TMP8]], <4 x i32> +; FIXED-NEXT: [[TMP14:%.*]] = shufflevector <2 x i64> [[TMP9]], <2 x i64> [[TMP10]], <4 x i32> +; FIXED-NEXT: [[TMP15:%.*]] = shufflevector <4 x i64> [[TMP11]], <4 x i64> [[TMP12]], <8 x i32> +; FIXED-NEXT: [[TMP16:%.*]] = shufflevector <4 x i64> [[TMP13]], <4 x i64> [[TMP14]], <8 x i32> +; FIXED-NEXT: [[TMP17:%.*]] = shufflevector <8 x i64> [[TMP15]], <8 x i64> [[TMP16]], <16 x i32> +; FIXED-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <16 x i64> [[TMP17]], <16 x i64> poison, <16 x i32> +; FIXED-NEXT: store <16 x i64> [[INTERLEAVED_VEC]], ptr [[Q0]], align 8 +; FIXED-NEXT: [[NEXTI]] = add nuw i64 [[I]], 2 +; FIXED-NEXT: [[TMP18:%.*]] = icmp eq i64 [[NEXTI]], 1024 +; FIXED-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP10:![0-9]+]] ; FIXED: middle.block: ; FIXED-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] ; FIXED: scalar.ph: -; FIXED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; FIXED-NEXT: br label [[LOOP:%.*]] +; FIXED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] +; FIXED-NEXT: br label [[LOOP1:%.*]] ; FIXED: loop: -; FIXED-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] -; FIXED-NEXT: [[OFFSET0:%.*]] = shl i64 [[I]], 3 -; FIXED-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]] -; FIXED-NEXT: [[X0:%.*]] = load i64, ptr [[Q0]], align 8 +; FIXED-NEXT: [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ] +; FIXED-NEXT: [[OFFSET8:%.*]] = shl i64 [[I1]], 3 +; FIXED-NEXT: [[Q8:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET8]] +; FIXED-NEXT: [[X0:%.*]] = load i64, ptr [[Q8]], align 8 ; FIXED-NEXT: [[Y0:%.*]] = add i64 [[X0]], 1 -; FIXED-NEXT: store i64 [[Y0]], ptr [[Q0]], align 8 -; FIXED-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1 +; FIXED-NEXT: store i64 [[Y0]], ptr [[Q8]], align 8 +; FIXED-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET8]], 1 ; FIXED-NEXT: [[Q1:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET1]] ; FIXED-NEXT: [[X1:%.*]] = load i64, ptr [[Q1]], align 8 ; FIXED-NEXT: [[Y1:%.*]] = add i64 [[X1]], 2 @@ -916,64 +935,83 @@ define void @load_store_factor8(ptr %p) { ; FIXED-NEXT: [[X7:%.*]] = load i64, ptr [[Q7]], align 8 ; FIXED-NEXT: [[Y7:%.*]] = add i64 [[X7]], 8 ; 
FIXED-NEXT: store i64 [[Y7]], ptr [[Q7]], align 8 -; FIXED-NEXT: [[NEXTI]] = add i64 [[I]], 1 -; FIXED-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; FIXED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP11:![0-9]+]] +; FIXED-NEXT: [[NEXTI1]] = add i64 [[I1]], 1 +; FIXED-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024 +; FIXED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP11:![0-9]+]] ; FIXED: exit: ; FIXED-NEXT: ret void ; ; SCALABLE-LABEL: @load_store_factor8( ; SCALABLE-NEXT: entry: -; SCALABLE-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; SCALABLE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; SCALABLE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP0]] +; SCALABLE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] ; SCALABLE: vector.ph: -; SCALABLE-NEXT: br label [[VECTOR_BODY:%.*]] +; SCALABLE-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() +; SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP1]] +; SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] +; SCALABLE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; SCALABLE-NEXT: br label [[LOOP:%.*]] ; SCALABLE: vector.body: -; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; SCALABLE-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; SCALABLE-NEXT: [[TMP1:%.*]] = shl i64 [[TMP0]], 3 -; SCALABLE-NEXT: [[TMP2:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP1]] -; SCALABLE-NEXT: [[WIDE_VEC:%.*]] = load <16 x i64>, ptr [[TMP2]], align 8 -; SCALABLE-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> -; SCALABLE-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> -; SCALABLE-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> -; SCALABLE-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> -; SCALABLE-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> -; SCALABLE-NEXT: [[STRIDED_VEC5:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> -; SCALABLE-NEXT: [[STRIDED_VEC6:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> -; SCALABLE-NEXT: [[STRIDED_VEC7:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> -; SCALABLE-NEXT: [[TMP4:%.*]] = add <2 x i64> [[STRIDED_VEC]], splat (i64 1) -; SCALABLE-NEXT: [[TMP6:%.*]] = add <2 x i64> [[STRIDED_VEC1]], splat (i64 2) -; SCALABLE-NEXT: [[TMP8:%.*]] = add <2 x i64> [[STRIDED_VEC2]], splat (i64 3) -; SCALABLE-NEXT: [[TMP10:%.*]] = add <2 x i64> [[STRIDED_VEC3]], splat (i64 4) -; SCALABLE-NEXT: [[TMP12:%.*]] = add <2 x i64> [[STRIDED_VEC4]], splat (i64 5) -; SCALABLE-NEXT: [[TMP14:%.*]] = add <2 x i64> [[STRIDED_VEC5]], splat (i64 6) -; SCALABLE-NEXT: [[TMP16:%.*]] = add <2 x i64> [[STRIDED_VEC6]], splat (i64 7) -; SCALABLE-NEXT: [[TMP19:%.*]] = add <2 x i64> [[STRIDED_VEC7]], splat (i64 8) -; SCALABLE-NEXT: [[TMP21:%.*]] = shufflevector <2 x i64> [[TMP4]], <2 x i64> [[TMP6]], <4 x i32> -; SCALABLE-NEXT: [[TMP22:%.*]] = shufflevector <2 x i64> [[TMP8]], <2 x i64> [[TMP10]], <4 x i32> -; SCALABLE-NEXT: [[TMP23:%.*]] = shufflevector <2 x i64> [[TMP12]], <2 x i64> [[TMP14]], <4 x i32> -; SCALABLE-NEXT: [[TMP24:%.*]] = shufflevector <2 x i64> [[TMP16]], <2 x i64> [[TMP19]], <4 x i32> -; SCALABLE-NEXT: [[TMP25:%.*]] = shufflevector <4 x 
i64> [[TMP21]], <4 x i64> [[TMP22]], <8 x i32> -; SCALABLE-NEXT: [[TMP26:%.*]] = shufflevector <4 x i64> [[TMP23]], <4 x i64> [[TMP24]], <8 x i32> -; SCALABLE-NEXT: [[TMP27:%.*]] = shufflevector <8 x i64> [[TMP25]], <8 x i64> [[TMP26]], <16 x i32> -; SCALABLE-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <16 x i64> [[TMP27]], <16 x i64> poison, <16 x i32> -; SCALABLE-NEXT: store <16 x i64> [[INTERLEAVED_VEC]], ptr [[TMP2]], align 8 -; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 -; SCALABLE-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; SCALABLE-NEXT: br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; SCALABLE-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; SCALABLE-NEXT: [[TMP3:%.*]] = add i64 [[I]], 0 +; SCALABLE-NEXT: [[OFFSET0:%.*]] = shl i64 [[TMP3]], 3 +; SCALABLE-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[OFFSET0]] +; SCALABLE-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[Q0]], align 8 +; SCALABLE-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.vector.deinterleave2.nxv8i64( [[WIDE_VEC]]) +; SCALABLE-NEXT: [[TMP6:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 +; SCALABLE-NEXT: [[TMP7:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 +; SCALABLE-NEXT: [[STRIDED_VEC1:%.*]] = call { , } @llvm.vector.deinterleave2.nxv4i64( [[TMP6]]) +; SCALABLE-NEXT: [[STRIDED_VEC2:%.*]] = call { , } @llvm.vector.deinterleave2.nxv4i64( [[TMP7]]) +; SCALABLE-NEXT: [[TMP8:%.*]] = extractvalue { , } [[STRIDED_VEC1]], 0 +; SCALABLE-NEXT: [[TMP9:%.*]] = extractvalue { , } [[STRIDED_VEC2]], 0 +; SCALABLE-NEXT: [[TMP10:%.*]] = extractvalue { , } [[STRIDED_VEC1]], 1 +; SCALABLE-NEXT: [[TMP11:%.*]] = extractvalue { , } [[STRIDED_VEC2]], 1 +; SCALABLE-NEXT: [[STRIDED_VEC3:%.*]] = call { , } @llvm.vector.deinterleave2.nxv2i64( [[TMP8]]) +; SCALABLE-NEXT: [[STRIDED_VEC4:%.*]] = call { , } @llvm.vector.deinterleave2.nxv2i64( [[TMP9]]) +; SCALABLE-NEXT: [[STRIDED_VEC5:%.*]] = call { , } @llvm.vector.deinterleave2.nxv2i64( [[TMP10]]) +; SCALABLE-NEXT: [[STRIDED_VEC6:%.*]] = call { , } @llvm.vector.deinterleave2.nxv2i64( [[TMP11]]) +; SCALABLE-NEXT: [[TMP12:%.*]] = extractvalue { , } [[STRIDED_VEC3]], 0 +; SCALABLE-NEXT: [[TMP13:%.*]] = extractvalue { , } [[STRIDED_VEC4]], 0 +; SCALABLE-NEXT: [[TMP14:%.*]] = extractvalue { , } [[STRIDED_VEC5]], 0 +; SCALABLE-NEXT: [[TMP15:%.*]] = extractvalue { , } [[STRIDED_VEC6]], 0 +; SCALABLE-NEXT: [[TMP16:%.*]] = extractvalue { , } [[STRIDED_VEC3]], 1 +; SCALABLE-NEXT: [[TMP17:%.*]] = extractvalue { , } [[STRIDED_VEC4]], 1 +; SCALABLE-NEXT: [[TMP18:%.*]] = extractvalue { , } [[STRIDED_VEC5]], 1 +; SCALABLE-NEXT: [[TMP19:%.*]] = extractvalue { , } [[STRIDED_VEC6]], 1 +; SCALABLE-NEXT: [[TMP20:%.*]] = add [[TMP12]], splat (i64 1) +; SCALABLE-NEXT: [[TMP21:%.*]] = add [[TMP13]], splat (i64 2) +; SCALABLE-NEXT: [[TMP22:%.*]] = add [[TMP14]], splat (i64 3) +; SCALABLE-NEXT: [[TMP23:%.*]] = add [[TMP15]], splat (i64 4) +; SCALABLE-NEXT: [[TMP24:%.*]] = add [[TMP16]], splat (i64 5) +; SCALABLE-NEXT: [[TMP25:%.*]] = add [[TMP17]], splat (i64 6) +; SCALABLE-NEXT: [[TMP26:%.*]] = add [[TMP18]], splat (i64 7) +; SCALABLE-NEXT: [[TMP27:%.*]] = add [[TMP19]], splat (i64 8) +; SCALABLE-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave2.nxv2i64( [[TMP20]], [[TMP24]]) +; SCALABLE-NEXT: [[INTERLEAVED_VEC7:%.*]] = call @llvm.vector.interleave2.nxv2i64( [[TMP21]], [[TMP25]]) +; SCALABLE-NEXT: [[INTERLEAVED_VEC8:%.*]] = call @llvm.vector.interleave2.nxv2i64( [[TMP22]], [[TMP26]]) +; 
SCALABLE-NEXT: [[INTERLEAVED_VEC9:%.*]] = call @llvm.vector.interleave2.nxv2i64( [[TMP23]], [[TMP27]]) +; SCALABLE-NEXT: [[INTERLEAVED_VEC10:%.*]] = call @llvm.vector.interleave2.nxv4i64( [[INTERLEAVED_VEC]], [[INTERLEAVED_VEC8]]) +; SCALABLE-NEXT: [[INTERLEAVED_VEC11:%.*]] = call @llvm.vector.interleave2.nxv4i64( [[INTERLEAVED_VEC7]], [[INTERLEAVED_VEC9]]) +; SCALABLE-NEXT: [[INTERLEAVED_VEC12:%.*]] = call @llvm.vector.interleave2.nxv8i64( [[INTERLEAVED_VEC10]], [[INTERLEAVED_VEC11]]) +; SCALABLE-NEXT: store [[INTERLEAVED_VEC12]], ptr [[Q0]], align 8 +; SCALABLE-NEXT: [[NEXTI]] = add nuw i64 [[I]], [[TMP2]] +; SCALABLE-NEXT: [[TMP28:%.*]] = icmp eq i64 [[NEXTI]], [[N_VEC]] +; SCALABLE-NEXT: br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP10:![0-9]+]] ; SCALABLE: middle.block: -; SCALABLE-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] +; SCALABLE-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] +; SCALABLE-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; SCALABLE: scalar.ph: -; SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; SCALABLE-NEXT: br label [[LOOP:%.*]] +; SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] +; SCALABLE-NEXT: br label [[LOOP1:%.*]] ; SCALABLE: loop: -; SCALABLE-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] -; SCALABLE-NEXT: [[OFFSET0:%.*]] = shl i64 [[I]], 3 -; SCALABLE-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]] -; SCALABLE-NEXT: [[X0:%.*]] = load i64, ptr [[Q0]], align 8 +; SCALABLE-NEXT: [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ] +; SCALABLE-NEXT: [[OFFSET8:%.*]] = shl i64 [[I1]], 3 +; SCALABLE-NEXT: [[Q8:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET8]] +; SCALABLE-NEXT: [[X0:%.*]] = load i64, ptr [[Q8]], align 8 ; SCALABLE-NEXT: [[Y0:%.*]] = add i64 [[X0]], 1 -; SCALABLE-NEXT: store i64 [[Y0]], ptr [[Q0]], align 8 -; SCALABLE-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1 +; SCALABLE-NEXT: store i64 [[Y0]], ptr [[Q8]], align 8 +; SCALABLE-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET8]], 1 ; SCALABLE-NEXT: [[Q1:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET1]] ; SCALABLE-NEXT: [[X1:%.*]] = load i64, ptr [[Q1]], align 8 ; SCALABLE-NEXT: [[Y1:%.*]] = add i64 [[X1]], 2 @@ -1008,9 +1046,9 @@ define void @load_store_factor8(ptr %p) { ; SCALABLE-NEXT: [[X7:%.*]] = load i64, ptr [[Q7]], align 8 ; SCALABLE-NEXT: [[Y7:%.*]] = add i64 [[X7]], 8 ; SCALABLE-NEXT: store i64 [[Y7]], ptr [[Q7]], align 8 -; SCALABLE-NEXT: [[NEXTI]] = add i64 [[I]], 1 -; SCALABLE-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; SCALABLE-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP11:![0-9]+]] +; SCALABLE-NEXT: [[NEXTI1]] = add i64 [[I1]], 1 +; SCALABLE-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024 +; SCALABLE-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP11:![0-9]+]] ; SCALABLE: exit: ; SCALABLE-NEXT: ret void ; @@ -1080,7 +1118,7 @@ define void @combine_load_factor2_i32(ptr noalias %p, ptr noalias %q) { ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] ; CHECK: 
vector.ph: ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4 @@ -1088,94 +1126,94 @@ define void @combine_load_factor2_i32(ptr noalias %p, ptr noalias %q) { ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] ; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 -; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP7:%.*]] = shl i64 [[TMP6]], 1 -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[TMP7]] -; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <vscale x 8 x i32>, ptr [[TMP8]], align 4 +; CHECK-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[I]], 0 +; CHECK-NEXT: [[OFFSET0:%.*]] = shl i64 [[TMP6]], 1 +; CHECK-NEXT: [[Q0:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[OFFSET0]] +; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <vscale x 8 x i32>, ptr [[Q0]], align 4 ; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_VEC]]) -; CHECK-NEXT: [[TMP10:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 0 -; CHECK-NEXT: [[TMP11:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 1 -; CHECK-NEXT: [[TMP12:%.*]] = add <vscale x 4 x i32> [[TMP10]], [[TMP11]] -; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i32, ptr [[Q:%.*]], i64 [[TMP6]] -; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i32, ptr [[TMP13]], i32 0 -; CHECK-NEXT: store <vscale x 4 x i32> [[TMP12]], ptr [[TMP14]], align 4 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] -; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 0 +; CHECK-NEXT: [[TMP10:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 1 +; CHECK-NEXT: [[TMP11:%.*]] = add <vscale x 4 x i32> [[TMP9]], [[TMP10]] +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[Q:%.*]], i64 [[TMP6]] +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i32, ptr [[TMP12]], i32 0 +; CHECK-NEXT: store <vscale x 4 x i32> [[TMP11]], ptr [[TMP13]], align 4 +; CHECK-NEXT: [[NEXTI]] = add nuw i64 [[I]], [[TMP5]] +; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[NEXTI]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP12:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] +; CHECK-NEXT: br label [[LOOP1:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[OFFSET0:%.*]] = shl i64 [[I]], 1 -; CHECK-NEXT: [[Q0:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET0]] -; CHECK-NEXT: [[X0:%.*]] = load i32, ptr [[Q0]], align 4 -; CHECK-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1 +; CHECK-NEXT: [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ] +; CHECK-NEXT: [[OFFSET2:%.*]] = shl i64 [[I1]], 1 +; CHECK-NEXT: [[Q2:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET2]] +; CHECK-NEXT:
[[X0:%.*]] = load i32, ptr [[Q2]], align 4 +; CHECK-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET2]], 1 ; CHECK-NEXT: [[Q1:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET1]] ; CHECK-NEXT: [[X1:%.*]] = load i32, ptr [[Q1]], align 4 ; CHECK-NEXT: [[RES:%.*]] = add i32 [[X0]], [[X1]] -; CHECK-NEXT: [[DST:%.*]] = getelementptr i32, ptr [[Q]], i64 [[I]] +; CHECK-NEXT: [[DST:%.*]] = getelementptr i32, ptr [[Q]], i64 [[I1]] ; CHECK-NEXT: store i32 [[RES]], ptr [[DST]], align 4 -; CHECK-NEXT: [[NEXTI]] = add i64 [[I]], 1 -; CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; CHECK-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP13:![0-9]+]] +; CHECK-NEXT: [[NEXTI1]] = add i64 [[I1]], 1 +; CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024 +; CHECK-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP13:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void ; ; FIXED-LABEL: @combine_load_factor2_i32( ; FIXED-NEXT: entry: -; FIXED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; FIXED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] ; FIXED: vector.ph: -; FIXED-NEXT: br label [[VECTOR_BODY:%.*]] +; FIXED-NEXT: br label [[LOOP:%.*]] ; FIXED: vector.body: -; FIXED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; FIXED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; FIXED-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 8 -; FIXED-NEXT: [[TMP2:%.*]] = shl i64 [[TMP0]], 1 +; FIXED-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; FIXED-NEXT: [[TMP0:%.*]] = add i64 [[I]], 0 +; FIXED-NEXT: [[TMP1:%.*]] = add i64 [[I]], 8 +; FIXED-NEXT: [[OFFSET0:%.*]] = shl i64 [[TMP0]], 1 ; FIXED-NEXT: [[TMP3:%.*]] = shl i64 [[TMP1]], 1 -; FIXED-NEXT: [[TMP4:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[TMP2]] +; FIXED-NEXT: [[Q0:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[OFFSET0]] ; FIXED-NEXT: [[TMP5:%.*]] = getelementptr i32, ptr [[P]], i64 [[TMP3]] -; FIXED-NEXT: [[WIDE_VEC:%.*]] = load <16 x i32>, ptr [[TMP4]], align 4 +; FIXED-NEXT: [[WIDE_VEC:%.*]] = load <16 x i32>, ptr [[Q0]], align 4 ; FIXED-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <8 x i32> -; FIXED-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <8 x i32> -; FIXED-NEXT: [[WIDE_VEC1:%.*]] = load <16 x i32>, ptr [[TMP5]], align 4 -; FIXED-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <16 x i32> [[WIDE_VEC1]], <16 x i32> poison, <8 x i32> -; FIXED-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <16 x i32> [[WIDE_VEC1]], <16 x i32> poison, <8 x i32> -; FIXED-NEXT: [[TMP8:%.*]] = add <8 x i32> [[STRIDED_VEC]], [[STRIDED_VEC3]] -; FIXED-NEXT: [[TMP9:%.*]] = add <8 x i32> [[STRIDED_VEC2]], [[STRIDED_VEC4]] -; FIXED-NEXT: [[TMP10:%.*]] = getelementptr i32, ptr [[Q:%.*]], i64 [[TMP0]] -; FIXED-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[TMP10]], i32 0 -; FIXED-NEXT: [[TMP13:%.*]] = getelementptr i32, ptr [[TMP10]], i32 8 -; FIXED-NEXT: store <8 x i32> [[TMP8]], ptr [[TMP12]], align 4 -; FIXED-NEXT: store <8 x i32> [[TMP9]], ptr [[TMP13]], align 4 -; FIXED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; FIXED-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; FIXED-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; FIXED-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <8 x i32> +; FIXED-NEXT: [[WIDE_VEC2:%.*]] = load <16 x i32>, ptr [[TMP5]], 
align 4 +; FIXED-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <16 x i32> [[WIDE_VEC2]], <16 x i32> poison, <8 x i32> +; FIXED-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <16 x i32> [[WIDE_VEC2]], <16 x i32> poison, <8 x i32> +; FIXED-NEXT: [[TMP6:%.*]] = add <8 x i32> [[STRIDED_VEC]], [[STRIDED_VEC1]] +; FIXED-NEXT: [[TMP7:%.*]] = add <8 x i32> [[STRIDED_VEC3]], [[STRIDED_VEC4]] +; FIXED-NEXT: [[TMP8:%.*]] = getelementptr i32, ptr [[Q:%.*]], i64 [[TMP0]] +; FIXED-NEXT: [[TMP9:%.*]] = getelementptr i32, ptr [[TMP8]], i32 0 +; FIXED-NEXT: [[TMP10:%.*]] = getelementptr i32, ptr [[TMP8]], i32 8 +; FIXED-NEXT: store <8 x i32> [[TMP6]], ptr [[TMP9]], align 4 +; FIXED-NEXT: store <8 x i32> [[TMP7]], ptr [[TMP10]], align 4 +; FIXED-NEXT: [[NEXTI]] = add nuw i64 [[I]], 16 +; FIXED-NEXT: [[TMP11:%.*]] = icmp eq i64 [[NEXTI]], 1024 +; FIXED-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP12:![0-9]+]] ; FIXED: middle.block: ; FIXED-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] ; FIXED: scalar.ph: -; FIXED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; FIXED-NEXT: br label [[LOOP:%.*]] +; FIXED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] +; FIXED-NEXT: br label [[LOOP1:%.*]] ; FIXED: loop: -; FIXED-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] -; FIXED-NEXT: [[OFFSET0:%.*]] = shl i64 [[I]], 1 -; FIXED-NEXT: [[Q0:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET0]] -; FIXED-NEXT: [[X0:%.*]] = load i32, ptr [[Q0]], align 4 -; FIXED-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1 +; FIXED-NEXT: [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ] +; FIXED-NEXT: [[OFFSET2:%.*]] = shl i64 [[I1]], 1 +; FIXED-NEXT: [[Q2:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET2]] +; FIXED-NEXT: [[X0:%.*]] = load i32, ptr [[Q2]], align 4 +; FIXED-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET2]], 1 ; FIXED-NEXT: [[Q1:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET1]] ; FIXED-NEXT: [[X1:%.*]] = load i32, ptr [[Q1]], align 4 ; FIXED-NEXT: [[RES:%.*]] = add i32 [[X0]], [[X1]] -; FIXED-NEXT: [[DST:%.*]] = getelementptr i32, ptr [[Q]], i64 [[I]] +; FIXED-NEXT: [[DST:%.*]] = getelementptr i32, ptr [[Q]], i64 [[I1]] ; FIXED-NEXT: store i32 [[RES]], ptr [[DST]], align 4 -; FIXED-NEXT: [[NEXTI]] = add i64 [[I]], 1 -; FIXED-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; FIXED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP13:![0-9]+]] +; FIXED-NEXT: [[NEXTI1]] = add i64 [[I1]], 1 +; FIXED-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024 +; FIXED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP13:![0-9]+]] ; FIXED: exit: ; FIXED-NEXT: ret void ; @@ -1184,7 +1222,7 @@ define void @combine_load_factor2_i32(ptr noalias %p, ptr noalias %q) { ; SCALABLE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; SCALABLE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 ; SCALABLE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] -; SCALABLE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; SCALABLE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] ; SCALABLE: vector.ph: ; SCALABLE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; SCALABLE-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4 @@ -1192,43 +1230,43 @@ define void @combine_load_factor2_i32(ptr noalias %p, ptr noalias %q) { ; SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 
1024, [[N_MOD_VF]] ; SCALABLE-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() ; SCALABLE-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 -; SCALABLE-NEXT: br label [[VECTOR_BODY:%.*]] +; SCALABLE-NEXT: br label [[LOOP:%.*]] ; SCALABLE: vector.body: -; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; SCALABLE-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 -; SCALABLE-NEXT: [[TMP7:%.*]] = shl i64 [[TMP6]], 1 -; SCALABLE-NEXT: [[TMP8:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[TMP7]] -; SCALABLE-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[TMP8]], align 4 +; SCALABLE-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; SCALABLE-NEXT: [[TMP6:%.*]] = add i64 [[I]], 0 +; SCALABLE-NEXT: [[OFFSET0:%.*]] = shl i64 [[TMP6]], 1 +; SCALABLE-NEXT: [[Q0:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[OFFSET0]] +; SCALABLE-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[Q0]], align 4 ; SCALABLE-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.vector.deinterleave2.nxv8i32( [[WIDE_VEC]]) -; SCALABLE-NEXT: [[TMP10:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 -; SCALABLE-NEXT: [[TMP11:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 -; SCALABLE-NEXT: [[TMP12:%.*]] = add [[TMP10]], [[TMP11]] -; SCALABLE-NEXT: [[TMP13:%.*]] = getelementptr i32, ptr [[Q:%.*]], i64 [[TMP6]] -; SCALABLE-NEXT: [[TMP14:%.*]] = getelementptr i32, ptr [[TMP13]], i32 0 -; SCALABLE-NEXT: store [[TMP12]], ptr [[TMP14]], align 4 -; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] -; SCALABLE-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; SCALABLE-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; SCALABLE-NEXT: [[TMP9:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 +; SCALABLE-NEXT: [[TMP10:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 +; SCALABLE-NEXT: [[TMP11:%.*]] = add [[TMP9]], [[TMP10]] +; SCALABLE-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[Q:%.*]], i64 [[TMP6]] +; SCALABLE-NEXT: [[TMP13:%.*]] = getelementptr i32, ptr [[TMP12]], i32 0 +; SCALABLE-NEXT: store [[TMP11]], ptr [[TMP13]], align 4 +; SCALABLE-NEXT: [[NEXTI]] = add nuw i64 [[I]], [[TMP5]] +; SCALABLE-NEXT: [[TMP14:%.*]] = icmp eq i64 [[NEXTI]], [[N_VEC]] +; SCALABLE-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP12:![0-9]+]] ; SCALABLE: middle.block: ; SCALABLE-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; SCALABLE-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; SCALABLE: scalar.ph: -; SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; SCALABLE-NEXT: br label [[LOOP:%.*]] +; SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] +; SCALABLE-NEXT: br label [[LOOP1:%.*]] ; SCALABLE: loop: -; SCALABLE-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] -; SCALABLE-NEXT: [[OFFSET0:%.*]] = shl i64 [[I]], 1 -; SCALABLE-NEXT: [[Q0:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET0]] -; SCALABLE-NEXT: [[X0:%.*]] = load i32, ptr [[Q0]], align 4 -; SCALABLE-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1 +; SCALABLE-NEXT: [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ] +; SCALABLE-NEXT: [[OFFSET2:%.*]] = shl i64 [[I1]], 1 +; SCALABLE-NEXT: [[Q2:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET2]] +; SCALABLE-NEXT: [[X0:%.*]] = load i32, ptr [[Q2]], align 4 +; SCALABLE-NEXT: [[OFFSET1:%.*]] = add i64 
[[OFFSET2]], 1 ; SCALABLE-NEXT: [[Q1:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET1]] ; SCALABLE-NEXT: [[X1:%.*]] = load i32, ptr [[Q1]], align 4 ; SCALABLE-NEXT: [[RES:%.*]] = add i32 [[X0]], [[X1]] -; SCALABLE-NEXT: [[DST:%.*]] = getelementptr i32, ptr [[Q]], i64 [[I]] +; SCALABLE-NEXT: [[DST:%.*]] = getelementptr i32, ptr [[Q]], i64 [[I1]] ; SCALABLE-NEXT: store i32 [[RES]], ptr [[DST]], align 4 -; SCALABLE-NEXT: [[NEXTI]] = add i64 [[I]], 1 -; SCALABLE-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; SCALABLE-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP13:![0-9]+]] +; SCALABLE-NEXT: [[NEXTI1]] = add i64 [[I1]], 1 +; SCALABLE-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024 +; SCALABLE-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP13:![0-9]+]] ; SCALABLE: exit: ; SCALABLE-NEXT: ret void ; @@ -1263,7 +1301,7 @@ define void @combine_load_factor2_i64(ptr noalias %p, ptr noalias %q) { ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 @@ -1271,94 +1309,94 @@ define void @combine_load_factor2_i64(ptr noalias %p, ptr noalias %q) { ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] ; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 2 -; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP7:%.*]] = shl i64 [[TMP6]], 1 -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP7]] -; CHECK-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[TMP8]], align 8 +; CHECK-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[I]], 0 +; CHECK-NEXT: [[OFFSET0:%.*]] = shl i64 [[TMP6]], 1 +; CHECK-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[OFFSET0]] +; CHECK-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[Q0]], align 8 ; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.vector.deinterleave2.nxv4i64( [[WIDE_VEC]]) -; CHECK-NEXT: [[TMP10:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 -; CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 -; CHECK-NEXT: [[TMP12:%.*]] = add [[TMP10]], [[TMP11]] -; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i64, ptr [[Q:%.*]], i64 [[TMP6]] -; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i64, ptr [[TMP13]], i32 0 -; CHECK-NEXT: store [[TMP12]], ptr [[TMP14]], align 8 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] -; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] +; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 +; CHECK-NEXT: [[TMP10:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 +; CHECK-NEXT: [[TMP11:%.*]] = add [[TMP9]], [[TMP10]] +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i64, ptr [[Q:%.*]], i64 [[TMP6]] +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i64, ptr [[TMP12]], i32 0 +; 
CHECK-NEXT: store [[TMP11]], ptr [[TMP13]], align 8 +; CHECK-NEXT: [[NEXTI]] = add nuw i64 [[I]], [[TMP5]] +; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[NEXTI]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP14:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] +; CHECK-NEXT: br label [[LOOP1:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[OFFSET0:%.*]] = shl i64 [[I]], 1 -; CHECK-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]] -; CHECK-NEXT: [[X0:%.*]] = load i64, ptr [[Q0]], align 8 -; CHECK-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1 +; CHECK-NEXT: [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ] +; CHECK-NEXT: [[OFFSET2:%.*]] = shl i64 [[I1]], 1 +; CHECK-NEXT: [[Q2:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET2]] +; CHECK-NEXT: [[X0:%.*]] = load i64, ptr [[Q2]], align 8 +; CHECK-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET2]], 1 ; CHECK-NEXT: [[Q1:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET1]] ; CHECK-NEXT: [[X1:%.*]] = load i64, ptr [[Q1]], align 8 ; CHECK-NEXT: [[RES:%.*]] = add i64 [[X0]], [[X1]] -; CHECK-NEXT: [[DST:%.*]] = getelementptr i64, ptr [[Q]], i64 [[I]] +; CHECK-NEXT: [[DST:%.*]] = getelementptr i64, ptr [[Q]], i64 [[I1]] ; CHECK-NEXT: store i64 [[RES]], ptr [[DST]], align 8 -; CHECK-NEXT: [[NEXTI]] = add i64 [[I]], 1 -; CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; CHECK-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP15:![0-9]+]] +; CHECK-NEXT: [[NEXTI1]] = add i64 [[I1]], 1 +; CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024 +; CHECK-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP15:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void ; ; FIXED-LABEL: @combine_load_factor2_i64( ; FIXED-NEXT: entry: -; FIXED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; FIXED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] ; FIXED: vector.ph: -; FIXED-NEXT: br label [[VECTOR_BODY:%.*]] +; FIXED-NEXT: br label [[LOOP:%.*]] ; FIXED: vector.body: -; FIXED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; FIXED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; FIXED-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 4 -; FIXED-NEXT: [[TMP2:%.*]] = shl i64 [[TMP0]], 1 +; FIXED-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; FIXED-NEXT: [[TMP0:%.*]] = add i64 [[I]], 0 +; FIXED-NEXT: [[TMP1:%.*]] = add i64 [[I]], 4 +; FIXED-NEXT: [[OFFSET0:%.*]] = shl i64 [[TMP0]], 1 ; FIXED-NEXT: [[TMP3:%.*]] = shl i64 [[TMP1]], 1 -; FIXED-NEXT: [[TMP4:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP2]] +; FIXED-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[OFFSET0]] ; FIXED-NEXT: [[TMP5:%.*]] = getelementptr i64, ptr [[P]], i64 [[TMP3]] -; FIXED-NEXT: [[WIDE_VEC:%.*]] = load <8 x i64>, ptr [[TMP4]], align 8 +; FIXED-NEXT: [[WIDE_VEC:%.*]] = load <8 x i64>, ptr [[Q0]], align 8 ; FIXED-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> 
poison, <4 x i32> -; FIXED-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32> -; FIXED-NEXT: [[WIDE_VEC1:%.*]] = load <8 x i64>, ptr [[TMP5]], align 8 -; FIXED-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <8 x i64> [[WIDE_VEC1]], <8 x i64> poison, <4 x i32> -; FIXED-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <8 x i64> [[WIDE_VEC1]], <8 x i64> poison, <4 x i32> -; FIXED-NEXT: [[TMP8:%.*]] = add <4 x i64> [[STRIDED_VEC]], [[STRIDED_VEC3]] -; FIXED-NEXT: [[TMP9:%.*]] = add <4 x i64> [[STRIDED_VEC2]], [[STRIDED_VEC4]] -; FIXED-NEXT: [[TMP10:%.*]] = getelementptr i64, ptr [[Q:%.*]], i64 [[TMP0]] -; FIXED-NEXT: [[TMP12:%.*]] = getelementptr i64, ptr [[TMP10]], i32 0 -; FIXED-NEXT: [[TMP13:%.*]] = getelementptr i64, ptr [[TMP10]], i32 4 -; FIXED-NEXT: store <4 x i64> [[TMP8]], ptr [[TMP12]], align 8 -; FIXED-NEXT: store <4 x i64> [[TMP9]], ptr [[TMP13]], align 8 -; FIXED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 -; FIXED-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; FIXED-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] +; FIXED-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32> +; FIXED-NEXT: [[WIDE_VEC2:%.*]] = load <8 x i64>, ptr [[TMP5]], align 8 +; FIXED-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <8 x i64> [[WIDE_VEC2]], <8 x i64> poison, <4 x i32> +; FIXED-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <8 x i64> [[WIDE_VEC2]], <8 x i64> poison, <4 x i32> +; FIXED-NEXT: [[TMP6:%.*]] = add <4 x i64> [[STRIDED_VEC]], [[STRIDED_VEC1]] +; FIXED-NEXT: [[TMP7:%.*]] = add <4 x i64> [[STRIDED_VEC3]], [[STRIDED_VEC4]] +; FIXED-NEXT: [[TMP8:%.*]] = getelementptr i64, ptr [[Q:%.*]], i64 [[TMP0]] +; FIXED-NEXT: [[TMP9:%.*]] = getelementptr i64, ptr [[TMP8]], i32 0 +; FIXED-NEXT: [[TMP10:%.*]] = getelementptr i64, ptr [[TMP8]], i32 4 +; FIXED-NEXT: store <4 x i64> [[TMP6]], ptr [[TMP9]], align 8 +; FIXED-NEXT: store <4 x i64> [[TMP7]], ptr [[TMP10]], align 8 +; FIXED-NEXT: [[NEXTI]] = add nuw i64 [[I]], 8 +; FIXED-NEXT: [[TMP11:%.*]] = icmp eq i64 [[NEXTI]], 1024 +; FIXED-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP14:![0-9]+]] ; FIXED: middle.block: ; FIXED-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] ; FIXED: scalar.ph: -; FIXED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; FIXED-NEXT: br label [[LOOP:%.*]] +; FIXED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] +; FIXED-NEXT: br label [[LOOP1:%.*]] ; FIXED: loop: -; FIXED-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] -; FIXED-NEXT: [[OFFSET0:%.*]] = shl i64 [[I]], 1 -; FIXED-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]] -; FIXED-NEXT: [[X0:%.*]] = load i64, ptr [[Q0]], align 8 -; FIXED-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1 +; FIXED-NEXT: [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ] +; FIXED-NEXT: [[OFFSET2:%.*]] = shl i64 [[I1]], 1 +; FIXED-NEXT: [[Q2:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET2]] +; FIXED-NEXT: [[X0:%.*]] = load i64, ptr [[Q2]], align 8 +; FIXED-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET2]], 1 ; FIXED-NEXT: [[Q1:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET1]] ; FIXED-NEXT: [[X1:%.*]] = load i64, ptr [[Q1]], align 8 ; FIXED-NEXT: [[RES:%.*]] = add i64 [[X0]], [[X1]] -; FIXED-NEXT: [[DST:%.*]] = getelementptr i64, ptr 
[[Q]], i64 [[I]] +; FIXED-NEXT: [[DST:%.*]] = getelementptr i64, ptr [[Q]], i64 [[I1]] ; FIXED-NEXT: store i64 [[RES]], ptr [[DST]], align 8 -; FIXED-NEXT: [[NEXTI]] = add i64 [[I]], 1 -; FIXED-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; FIXED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP15:![0-9]+]] +; FIXED-NEXT: [[NEXTI1]] = add i64 [[I1]], 1 +; FIXED-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024 +; FIXED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP15:![0-9]+]] ; FIXED: exit: ; FIXED-NEXT: ret void ; @@ -1367,7 +1405,7 @@ define void @combine_load_factor2_i64(ptr noalias %p, ptr noalias %q) { ; SCALABLE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; SCALABLE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 ; SCALABLE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] -; SCALABLE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; SCALABLE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] ; SCALABLE: vector.ph: ; SCALABLE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; SCALABLE-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 @@ -1375,43 +1413,43 @@ define void @combine_load_factor2_i64(ptr noalias %p, ptr noalias %q) { ; SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] ; SCALABLE-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() ; SCALABLE-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 2 -; SCALABLE-NEXT: br label [[VECTOR_BODY:%.*]] +; SCALABLE-NEXT: br label [[LOOP:%.*]] ; SCALABLE: vector.body: -; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; SCALABLE-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 -; SCALABLE-NEXT: [[TMP7:%.*]] = shl i64 [[TMP6]], 1 -; SCALABLE-NEXT: [[TMP8:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP7]] -; SCALABLE-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[TMP8]], align 8 +; SCALABLE-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; SCALABLE-NEXT: [[TMP6:%.*]] = add i64 [[I]], 0 +; SCALABLE-NEXT: [[OFFSET0:%.*]] = shl i64 [[TMP6]], 1 +; SCALABLE-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[OFFSET0]] +; SCALABLE-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[Q0]], align 8 ; SCALABLE-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.vector.deinterleave2.nxv4i64( [[WIDE_VEC]]) -; SCALABLE-NEXT: [[TMP10:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 -; SCALABLE-NEXT: [[TMP11:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 -; SCALABLE-NEXT: [[TMP12:%.*]] = add [[TMP10]], [[TMP11]] -; SCALABLE-NEXT: [[TMP13:%.*]] = getelementptr i64, ptr [[Q:%.*]], i64 [[TMP6]] -; SCALABLE-NEXT: [[TMP14:%.*]] = getelementptr i64, ptr [[TMP13]], i32 0 -; SCALABLE-NEXT: store [[TMP12]], ptr [[TMP14]], align 8 -; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] -; SCALABLE-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; SCALABLE-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] +; SCALABLE-NEXT: [[TMP9:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 +; SCALABLE-NEXT: [[TMP10:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 +; SCALABLE-NEXT: [[TMP11:%.*]] = add [[TMP9]], [[TMP10]] +; SCALABLE-NEXT: [[TMP12:%.*]] = getelementptr i64, ptr [[Q:%.*]], i64 [[TMP6]] +; SCALABLE-NEXT: [[TMP13:%.*]] = getelementptr i64, ptr [[TMP12]], i32 0 +; SCALABLE-NEXT: store [[TMP11]], ptr [[TMP13]], align 8 +; SCALABLE-NEXT: [[NEXTI]] = add nuw i64 [[I]], [[TMP5]] +; SCALABLE-NEXT: [[TMP14:%.*]] = icmp eq i64 [[NEXTI]], [[N_VEC]] 
+; SCALABLE-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP14:![0-9]+]] ; SCALABLE: middle.block: ; SCALABLE-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; SCALABLE-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; SCALABLE: scalar.ph: -; SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; SCALABLE-NEXT: br label [[LOOP:%.*]] +; SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] +; SCALABLE-NEXT: br label [[LOOP1:%.*]] ; SCALABLE: loop: -; SCALABLE-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] -; SCALABLE-NEXT: [[OFFSET0:%.*]] = shl i64 [[I]], 1 -; SCALABLE-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]] -; SCALABLE-NEXT: [[X0:%.*]] = load i64, ptr [[Q0]], align 8 -; SCALABLE-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1 +; SCALABLE-NEXT: [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ] +; SCALABLE-NEXT: [[OFFSET2:%.*]] = shl i64 [[I1]], 1 +; SCALABLE-NEXT: [[Q2:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET2]] +; SCALABLE-NEXT: [[X0:%.*]] = load i64, ptr [[Q2]], align 8 +; SCALABLE-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET2]], 1 ; SCALABLE-NEXT: [[Q1:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET1]] ; SCALABLE-NEXT: [[X1:%.*]] = load i64, ptr [[Q1]], align 8 ; SCALABLE-NEXT: [[RES:%.*]] = add i64 [[X0]], [[X1]] -; SCALABLE-NEXT: [[DST:%.*]] = getelementptr i64, ptr [[Q]], i64 [[I]] +; SCALABLE-NEXT: [[DST:%.*]] = getelementptr i64, ptr [[Q]], i64 [[I1]] ; SCALABLE-NEXT: store i64 [[RES]], ptr [[DST]], align 8 -; SCALABLE-NEXT: [[NEXTI]] = add i64 [[I]], 1 -; SCALABLE-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; SCALABLE-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP15:![0-9]+]] +; SCALABLE-NEXT: [[NEXTI1]] = add i64 [[I1]], 1 +; SCALABLE-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024 +; SCALABLE-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP15:![0-9]+]] ; SCALABLE: exit: ; SCALABLE-NEXT: ret void ; diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/sve-interleave-vectorization.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/sve-interleave-vectorization.ll new file mode 100644 index 0000000000000..362ec22600f92 --- /dev/null +++ b/llvm/test/Transforms/PhaseOrdering/AArch64/sve-interleave-vectorization.ll @@ -0,0 +1,135 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -passes=loop-vectorize,interleaved-access -mattr=+sve -S -o - %s | FileCheck %s + +target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" +target triple = "aarch64" + +%struct.xyzt = type { i32, i32, i32, i32 } +; for (int i = 0; i < 1024; ++i) { +; dst[i].x = a[i].x + b[i].x; +; dst[i].y = a[i].y - b[i].y; +; dst[i].z = a[i].z << b[i].z; +; dst[i].t = a[i].t >> b[i].t; +; } + +define void @interleave_deinterleave(ptr noalias %dst, ptr %a, ptr %b) { +; CHECK-LABEL: @interleave_deinterleave( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[FOR_BODY_PREHEADER:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 
1024, [[TMP3]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] +; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[STRUCT_XYZT:%.*]], ptr [[A:%.*]], i64 [[TMP6]] +; CHECK-NEXT: [[LDN:%.*]] = call { , , , } @llvm.aarch64.sve.ld4.sret.nxv4i32( splat (i1 true), ptr [[TMP7]]) +; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[LDN]], 0 +; CHECK-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[LDN]], 1 +; CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[LDN]], 2 +; CHECK-NEXT: [[TMP12:%.*]] = extractvalue { , , , } [[LDN]], 3 +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds [[STRUCT_XYZT]], ptr [[B:%.*]], i64 [[TMP6]] +; CHECK-NEXT: [[LDN9:%.*]] = call { , , , } @llvm.aarch64.sve.ld4.sret.nxv4i32( splat (i1 true), ptr [[TMP13]]) +; CHECK-NEXT: [[TMP16:%.*]] = extractvalue { , , , } [[LDN9]], 0 +; CHECK-NEXT: [[TMP17:%.*]] = extractvalue { , , , } [[LDN9]], 1 +; CHECK-NEXT: [[TMP18:%.*]] = extractvalue { , , , } [[LDN9]], 2 +; CHECK-NEXT: [[TMP19:%.*]] = extractvalue { , , , } [[LDN9]], 3 +; CHECK-NEXT: [[TMP20:%.*]] = add nsw [[TMP16]], [[TMP9]] +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds [[STRUCT_XYZT]], ptr [[DST:%.*]], i64 [[TMP6]] +; CHECK-NEXT: [[TMP22:%.*]] = sub nsw [[TMP10]], [[TMP17]] +; CHECK-NEXT: [[TMP23:%.*]] = shl [[TMP11]], [[TMP18]] +; CHECK-NEXT: [[TMP24:%.*]] = ashr [[TMP12]], [[TMP19]] +; CHECK-NEXT: call void @llvm.aarch64.sve.st4.nxv4i32( [[TMP20]], [[TMP22]], [[TMP23]], [[TMP24]], splat (i1 true), ptr [[TMP21]]) +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] +; CHECK-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[FOR_BODY_PREHEADER]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_XYZT]], ptr [[A]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP31:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [[STRUCT_XYZT]], ptr [[B]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP32:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4 +; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP32]], [[TMP31]] +; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds [[STRUCT_XYZT]], ptr [[DST]], i64 [[INDVARS_IV]] +; CHECK-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX5]], align 4 +; CHECK-NEXT: [[Y:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX]], i64 4 +; CHECK-NEXT: [[TMP33:%.*]] = load i32, ptr [[Y]], align 4 +; CHECK-NEXT: [[Y11:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX2]], i64 4 +; CHECK-NEXT: [[TMP26:%.*]] = load i32, ptr [[Y11]], align 4 +; CHECK-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP33]], [[TMP26]] +; CHECK-NEXT: [[Y14:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX5]], i64 4 +; 
CHECK-NEXT: store i32 [[SUB]], ptr [[Y14]], align 4 +; CHECK-NEXT: [[Z:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX]], i64 8 +; CHECK-NEXT: [[TMP27:%.*]] = load i32, ptr [[Z]], align 4 +; CHECK-NEXT: [[Z19:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX2]], i64 8 +; CHECK-NEXT: [[TMP28:%.*]] = load i32, ptr [[Z19]], align 4 +; CHECK-NEXT: [[SHL:%.*]] = shl i32 [[TMP27]], [[TMP28]] +; CHECK-NEXT: [[Z22:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX5]], i64 8 +; CHECK-NEXT: store i32 [[SHL]], ptr [[Z22]], align 4 +; CHECK-NEXT: [[T:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX]], i64 12 +; CHECK-NEXT: [[TMP29:%.*]] = load i32, ptr [[T]], align 4 +; CHECK-NEXT: [[T27:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX2]], i64 12 +; CHECK-NEXT: [[TMP30:%.*]] = load i32, ptr [[T27]], align 4 +; CHECK-NEXT: [[SHR:%.*]] = ashr i32 [[TMP29]], [[TMP30]] +; CHECK-NEXT: [[T30:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX5]], i64 12 +; CHECK-NEXT: store i32 [[SHR]], ptr [[T30]], align 4 +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1024 +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK: for.end: +; CHECK-NEXT: ret void +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %gep.a = getelementptr inbounds %struct.xyzt, ptr %a, i64 %iv + %a.0 = load i32, ptr %gep.a, align 4 + %gep.b = getelementptr inbounds %struct.xyzt, ptr %b, i64 %iv + %b.0 = load i32, ptr %gep.b, align 4 + %add = add nsw i32 %b.0, %a.0 + %gep.dst = getelementptr inbounds %struct.xyzt, ptr %dst, i64 %iv + store i32 %add, ptr %gep.dst, align 4 + %gep.a.1 = getelementptr inbounds nuw i8, ptr %gep.a, i64 4 + %a.1 = load i32, ptr %gep.a.1, align 4 + %gep.b.1 = getelementptr inbounds nuw i8, ptr %gep.b, i64 4 + %b.1 = load i32, ptr %gep.b.1, align 4 + %sub = sub nsw i32 %a.1, %b.1 + %gep.dst.1 = getelementptr inbounds nuw i8, ptr %gep.dst, i64 4 + store i32 %sub, ptr %gep.dst.1, align 4 + %gep.a.2 = getelementptr inbounds nuw i8, ptr %gep.a, i64 8 + %a.2 = load i32, ptr %gep.a.2, align 4 + %gep.b.2 = getelementptr inbounds nuw i8, ptr %gep.b, i64 8 + %b.2 = load i32, ptr %gep.b.2, align 4 + %shl = shl i32 %a.2, %b.2 + %gep.dst.2 = getelementptr inbounds nuw i8, ptr %gep.dst, i64 8 + store i32 %shl, ptr %gep.dst.2, align 4 + %gep.a.3 = getelementptr inbounds nuw i8, ptr %gep.a, i64 12 + %a.3 = load i32, ptr %gep.a.3, align 4 + %gep.b.3 = getelementptr inbounds nuw i8, ptr %gep.b, i64 12 + %b.3 = load i32, ptr %gep.b.3, align 4 + %shr = ashr i32 %a.3, %b.3 + %gep.dst.3 = getelementptr inbounds nuw i8, ptr %gep.dst, i64 12 + store i32 %shr, ptr %gep.dst.3, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, 1024 + br i1 %exitcond.not, label %for.end, label %for.body + +for.end: + ret void +} From b068f2fd0fefca1ee357483333f034d18e6d8214 Mon Sep 17 00:00:00 2001 From: Jacek Caban Date: Fri, 17 Jan 2025 11:36:12 +0100 Subject: [PATCH 23/45] [LLD][COFF] Process bitcode files separately for each symbol table on ARM64X (#123194) --- lld/COFF/COFFLinkerContext.h | 1 - lld/COFF/Driver.cpp | 41 ++++++++++++++++--------------- lld/COFF/InputFiles.cpp | 19 ++++++++++----- lld/COFF/InputFiles.h | 14 ++++++++--- lld/COFF/SymbolTable.cpp | 17 ++++++------- lld/COFF/SymbolTable.h | 8 ++++++ lld/test/COFF/lto-arm64x.ll | 47 ++++++++++++++++++++++++++++++++++++ 7 files changed, 108 
insertions(+), 39 deletions(-) create mode 100644 lld/test/COFF/lto-arm64x.ll diff --git a/lld/COFF/COFFLinkerContext.h b/lld/COFF/COFFLinkerContext.h index bdd625b8c3916..8322f829d4055 100644 --- a/lld/COFF/COFFLinkerContext.h +++ b/lld/COFF/COFFLinkerContext.h @@ -56,7 +56,6 @@ class COFFLinkerContext : public CommonLinkerContext { std::vector objFileInstances; std::map pdbInputFileInstances; std::vector importFileInstances; - std::vector bitcodeFileInstances; MergeChunk *mergeChunkInstances[Log2MaxSectionAlignment + 1] = {}; diff --git a/lld/COFF/Driver.cpp b/lld/COFF/Driver.cpp index 8b1a8dc3e5af7..898c6c17d2062 100644 --- a/lld/COFF/Driver.cpp +++ b/lld/COFF/Driver.cpp @@ -218,7 +218,7 @@ void LinkerDriver::addFile(InputFile *file) { << " linked in after " "doing LTO compilation."; } - ctx.bitcodeFileInstances.push_back(f); + f->symtab.bitcodeFileInstances.push_back(f); } else if (auto *f = dyn_cast(file)) { ctx.importFileInstances.push_back(f); } @@ -285,7 +285,7 @@ void LinkerDriver::addBuffer(std::unique_ptr mb, addFile(make(ctx, mbref)); break; case file_magic::bitcode: - addFile(make(ctx, mbref, "", 0, lazy)); + addFile(BitcodeFile::create(ctx, mbref, "", 0, lazy)); break; case file_magic::coff_object: case file_magic::coff_import_library: @@ -374,8 +374,8 @@ void LinkerDriver::addArchiveBuffer(MemoryBufferRef mb, StringRef symName, if (magic == file_magic::coff_object) { obj = ObjFile::create(ctx, mb); } else if (magic == file_magic::bitcode) { - obj = - make(ctx, mb, parentName, offsetInArchive, /*lazy=*/false); + obj = BitcodeFile::create(ctx, mb, parentName, offsetInArchive, + /*lazy=*/false); } else if (magic == file_magic::coff_cl_gl_object) { Err(ctx) << mb.getBufferIdentifier() << ": is not a native COFF file. Recompile without /GL?"; @@ -2571,19 +2571,19 @@ void LinkerDriver::linkerMain(ArrayRef argsArr) { } } - // If any inputs are bitcode files, the LTO code generator may create - // references to library functions that are not explicit in the bitcode - // file's symbol table. If any of those library functions are defined in a - // bitcode file in an archive member, we need to arrange to use LTO to - // compile those archive members by adding them to the link beforehand. - if (!ctx.bitcodeFileInstances.empty()) { - llvm::Triple TT( - ctx.bitcodeFileInstances.front()->obj->getTargetTriple()); - for (auto *s : lto::LTO::getRuntimeLibcallSymbols(TT)) - ctx.symtab.addLibcall(s); - } - ctx.forEachSymtab([&](SymbolTable &symtab) { + // If any inputs are bitcode files, the LTO code generator may create + // references to library functions that are not explicit in the bitcode + // file's symbol table. If any of those library functions are defined in + // a bitcode file in an archive member, we need to arrange to use LTO to + // compile those archive members by adding them to the link beforehand. + if (!symtab.bitcodeFileInstances.empty()) { + llvm::Triple TT( + symtab.bitcodeFileInstances.front()->obj->getTargetTriple()); + for (auto *s : lto::LTO::getRuntimeLibcallSymbols(TT)) + symtab.addLibcall(s); + } + // Windows specific -- if __load_config_used can be resolved, resolve // it. if (symtab.findUnderscore("_load_config_used")) @@ -2639,8 +2639,11 @@ void LinkerDriver::linkerMain(ArrayRef argsArr) { // If we are going to do codegen for link-time optimization, check for // unresolvable symbols first, so we don't spend time generating code that // will fail to link anyway. 
- if (!ctx.bitcodeFileInstances.empty() && !config->forceUnresolved) - ctx.symtab.reportUnresolvable(); + if (!config->forceUnresolved) + ctx.forEachSymtab([](SymbolTable &symtab) { + if (!symtab.bitcodeFileInstances.empty()) + symtab.reportUnresolvable(); + }); if (errorCount()) return; @@ -2655,7 +2658,7 @@ void LinkerDriver::linkerMain(ArrayRef argsArr) { // link those files (unless -thinlto-index-only was given, in which case we // resolve symbols and write indices, but don't generate native code or link). ltoCompilationDone = true; - ctx.symtab.compileBitcodeFiles(); + ctx.forEachSymtab([](SymbolTable &symtab) { symtab.compileBitcodeFiles(); }); if (Defined *d = dyn_cast_or_null(ctx.symtab.findUnderscore("_tls_used"))) diff --git a/lld/COFF/InputFiles.cpp b/lld/COFF/InputFiles.cpp index 66641ff9dcc1f..5ee73d4dc4f8b 100644 --- a/lld/COFF/InputFiles.cpp +++ b/lld/COFF/InputFiles.cpp @@ -1229,10 +1229,15 @@ void ImportFile::parse() { } } -BitcodeFile::BitcodeFile(COFFLinkerContext &ctx, MemoryBufferRef mb, - StringRef archiveName, uint64_t offsetInArchive, - bool lazy) - : InputFile(ctx.symtab, BitcodeKind, mb, lazy) { +BitcodeFile::BitcodeFile(SymbolTable &symtab, MemoryBufferRef mb, + std::unique_ptr &o, bool lazy) + : InputFile(symtab, BitcodeKind, mb, lazy) { + obj.swap(o); +} + +BitcodeFile *BitcodeFile::create(COFFLinkerContext &ctx, MemoryBufferRef mb, + StringRef archiveName, + uint64_t offsetInArchive, bool lazy) { std::string path = mb.getBufferIdentifier().str(); if (ctx.config.thinLTOIndexOnly) path = replaceThinLTOSuffix(mb.getBufferIdentifier(), @@ -1252,7 +1257,9 @@ BitcodeFile::BitcodeFile(COFFLinkerContext &ctx, MemoryBufferRef mb, sys::path::filename(path) + utostr(offsetInArchive))); - obj = check(lto::InputFile::create(mbref)); + std::unique_ptr obj = check(lto::InputFile::create(mbref)); + return make(ctx.getSymtab(getMachineType(obj.get())), mb, obj, + lazy); } BitcodeFile::~BitcodeFile() = default; @@ -1329,7 +1336,7 @@ void BitcodeFile::parseLazy() { } } -MachineTypes BitcodeFile::getMachineType() const { +MachineTypes BitcodeFile::getMachineType(const llvm::lto::InputFile *obj) { Triple t(obj->getTargetTriple()); switch (t.getArch()) { case Triple::x86_64: diff --git a/lld/COFF/InputFiles.h b/lld/COFF/InputFiles.h index d3075c5e0a338..823561cda247a 100644 --- a/lld/COFF/InputFiles.h +++ b/lld/COFF/InputFiles.h @@ -386,13 +386,19 @@ class ImportFile : public InputFile { // Used for LTO. 
class BitcodeFile : public InputFile { public: - explicit BitcodeFile(COFFLinkerContext &ctx, MemoryBufferRef mb, - StringRef archiveName, uint64_t offsetInArchive, - bool lazy); + explicit BitcodeFile(SymbolTable &symtab, MemoryBufferRef mb, + std::unique_ptr &obj, bool lazy); ~BitcodeFile(); + + static BitcodeFile *create(COFFLinkerContext &ctx, MemoryBufferRef mb, + StringRef archiveName, uint64_t offsetInArchive, + bool lazy); static bool classof(const InputFile *f) { return f->kind() == BitcodeKind; } ArrayRef getSymbols() { return symbols; } - MachineTypes getMachineType() const override; + MachineTypes getMachineType() const override { + return getMachineType(obj.get()); + } + static MachineTypes getMachineType(const llvm::lto::InputFile *obj); void parseLazy(); std::unique_ptr obj; diff --git a/lld/COFF/SymbolTable.cpp b/lld/COFF/SymbolTable.cpp index 36dcd0dfe1389..bf965e8a2332d 100644 --- a/lld/COFF/SymbolTable.cpp +++ b/lld/COFF/SymbolTable.cpp @@ -347,8 +347,8 @@ bool SymbolTable::handleMinGWAutomaticImport(Symbol *sym, StringRef name) { /// defined symbol imported" diagnostic for symbols in localImports. /// objFiles and bitcodeFiles (if not nullptr) are used to report where /// undefined symbols are referenced. -static void reportProblemSymbols( - COFFLinkerContext &ctx, const SmallPtrSetImpl &undefs, +void SymbolTable::reportProblemSymbols( + const SmallPtrSetImpl &undefs, const DenseMap *localImports, bool needBitcodeFiles) { // Return early if there is nothing to report (which should be // the common case). @@ -392,7 +392,7 @@ static void reportProblemSymbols( processFile(file, file->getSymbols()); if (needBitcodeFiles) - for (BitcodeFile *file : ctx.bitcodeFileInstances) + for (BitcodeFile *file : bitcodeFileInstances) processFile(file, file->getSymbols()); for (const UndefinedDiag &undefDiag : undefDiags) @@ -423,8 +423,7 @@ void SymbolTable::reportUnresolvable() { undefs.insert(sym); } - reportProblemSymbols(ctx, undefs, - /* localImports */ nullptr, true); + reportProblemSymbols(undefs, /*localImports=*/nullptr, true); } bool SymbolTable::resolveRemainingUndefines() { @@ -506,8 +505,8 @@ bool SymbolTable::resolveRemainingUndefines() { } reportProblemSymbols( - ctx, undefs, - ctx.config.warnLocallyDefinedImported ? &localImports : nullptr, false); + undefs, ctx.config.warnLocallyDefinedImported ? 
&localImports : nullptr, + false); return foundLazy; } @@ -1124,13 +1123,13 @@ Symbol *SymbolTable::addUndefined(StringRef name) { } void SymbolTable::compileBitcodeFiles() { - if (ctx.bitcodeFileInstances.empty()) + if (bitcodeFileInstances.empty()) return; llvm::TimeTraceScope timeScope("Compile bitcode"); ScopedTimer t(ctx.ltoTimer); lto.reset(new BitcodeCompiler(ctx)); - for (BitcodeFile *f : ctx.bitcodeFileInstances) + for (BitcodeFile *f : bitcodeFileInstances) lto->add(*f); for (InputFile *newObj : lto->compile()) { ObjFile *obj = cast(newObj); diff --git a/lld/COFF/SymbolTable.h b/lld/COFF/SymbolTable.h index 9e316fcdbe630..66bca0d63e5ff 100644 --- a/lld/COFF/SymbolTable.h +++ b/lld/COFF/SymbolTable.h @@ -14,6 +14,7 @@ #include "llvm/ADT/CachedHashString.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/DenseMapInfo.h" +#include "llvm/ADT/SmallPtrSet.h" #include "llvm/Support/raw_ostream.h" namespace llvm { @@ -155,6 +156,8 @@ class SymbolTable { callback(pair.second); } + std::vector bitcodeFileInstances; + DefinedRegular *loadConfigSym = nullptr; uint32_t loadConfigSize = 0; void initializeLoadConfig(); @@ -175,6 +178,11 @@ class SymbolTable { std::unique_ptr lto; std::vector> entryThunks; llvm::DenseMap exitThunks; + + void + reportProblemSymbols(const llvm::SmallPtrSetImpl &undefs, + const llvm::DenseMap *localImports, + bool needBitcodeFiles); }; std::vector getSymbolLocations(ObjFile *file, uint32_t symIndex); diff --git a/lld/test/COFF/lto-arm64x.ll b/lld/test/COFF/lto-arm64x.ll new file mode 100644 index 0000000000000..bbfc6b64c6fce --- /dev/null +++ b/lld/test/COFF/lto-arm64x.ll @@ -0,0 +1,47 @@ +; REQUIRES: aarch64, x86 +; RUN: split-file %s %t.dir && cd %t.dir + +; RUN: llvm-as arm64ec.ll -o arm64ec.obj +; RUN: llvm-as aarch64.ll -o aarch64.obj +; RUN: llvm-mc -filetype=obj -triple=aarch64-windows %S/Inputs/loadconfig-arm64.s -o loadconfig-arm64.obj +; RUN: llvm-mc -filetype=obj -triple=arm64ec-windows %S/Inputs/loadconfig-arm64ec.s -o loadconfig-arm64ec.obj + +; RUN: lld-link -machine:arm64x aarch64.obj arm64ec.obj loadconfig-arm64.obj loadconfig-arm64ec.obj -out:out.exe -subsystem:console +; RUN: llvm-objdump -d out.exe | FileCheck %s + +; CHECK: 0000000140001000 <.text>: +; CHECK-NEXT: 140001000: 52800020 mov w0, #0x1 // =1 +; CHECK-NEXT: 140001004: d65f03c0 ret +; CHECK-NEXT: ... 
+; CHECK-NEXT: 140002000: 00000009 udf #0x9
+; CHECK-NEXT: 140002004: 52800040 mov w0, #0x2 // =2
+; CHECK-NEXT: 140002008: d65f03c0 ret
+
+; CHECK: 0000000140003000 <.hexpthk>:
+; CHECK-NEXT: 140003000: 48 8b c4 movq %rsp, %rax
+; CHECK-NEXT: 140003003: 48 89 58 20 movq %rbx, 0x20(%rax)
+; CHECK-NEXT: 140003007: 55 pushq %rbp
+; CHECK-NEXT: 140003008: 5d popq %rbp
+; CHECK-NEXT: 140003009: e9 f6 ef ff ff jmp 0x140002004 <.text+0x1004>
+; CHECK-NEXT: 14000300e: cc int3
+; CHECK-NEXT: 14000300f: cc int3
+
+#--- arm64ec.ll
+
+target datalayout = "e-m:w-p270:32:32-p271:32:32-p272:64:64-p:64:64-i32:32-i64:64-i128:128-n32:64-S128-Fn32"
+target triple = "arm64ec-unknown-windows-msvc"
+
+define dso_local i32 @mainCRTStartup() {
+entry:
+  ret i32 2
+}
+
+#--- aarch64.ll
+
+target datalayout = "e-m:w-p270:32:32-p271:32:32-p272:64:64-p:64:64-i32:32-i64:64-i128:128-n32:64-S128-Fn32"
+target triple = "aarch64-unknown-windows-msvc"
+
+define dso_local i32 @mainCRTStartup() {
+entry:
+  ret i32 1
+}
From 101109fc5460d5bb9bb597c6ec77f998093a6687 Mon Sep 17 00:00:00 2001
From: Nikita Popov
Date: Fri, 17 Jan 2025 11:40:34 +0100
Subject: [PATCH 24/45] [MLIR] Add missing include (NFC)

Needed for libstdc++ 15 compatibility.
---
 mlir/include/mlir/Target/SPIRV/Deserialization.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/mlir/include/mlir/Target/SPIRV/Deserialization.h b/mlir/include/mlir/Target/SPIRV/Deserialization.h
index e39258beeaac8..a346a7fd1e5f7 100644
--- a/mlir/include/mlir/Target/SPIRV/Deserialization.h
+++ b/mlir/include/mlir/Target/SPIRV/Deserialization.h
@@ -15,6 +15,7 @@
 #include "mlir/IR/OwningOpRef.h"
 #include "mlir/Support/LLVM.h"
+#include

 namespace mlir {
 class MLIRContext;
From 831527a5ef63d24d056afc92509caf5ceb1d3682 Mon Sep 17 00:00:00 2001
From: Alexandros Lamprineas
Date: Fri, 17 Jan 2025 10:49:43 +0000
Subject: [PATCH 25/45] [FMV][GlobalOpt] Statically resolve calls to versioned functions. (#87939)

To deduce whether the optimization is legal we need to compare the target
features between caller and callee versions. The criteria for bypassing the
resolver are the following:

* If the callee's feature set is a subset of the caller's feature set, then
  the callee is a candidate for direct call.

* Among such candidates the one of highest priority is the best match and it
  shall be picked, unless there is a version of the callee with higher
  priority than the best match which cannot be picked from a higher priority
  caller (directly or through the resolver).

* For every higher priority callee version than the best match, there is a
  higher priority caller version whose feature set availability is implied by
  the callee's feature set.

Example:

Callers and Callees are ordered in decreasing priority.
The arrows indicate successful call redirections.
Caller           Callee     Explanation
=========================================================================
mops+sve2 --+--> mops       all the callee versions are subsets of the
            |               caller but mops has the highest priority
            |
mops      --+    sve2       between mops and default callees, mops wins
sve              sve        between sve and default callees, sve wins
                            but sve2 does not have a high priority caller
default -----> default      sve (callee) implies sve (caller),
                            sve2(callee) implies sve (caller),
                            mops(callee) implies mops(caller)
---
 .../llvm/Analysis/TargetTransformInfo.h | 17 +
 .../llvm/Analysis/TargetTransformInfoImpl.h | 4 +
 .../llvm/TargetParser/AArch64TargetParser.h | 13 +-
 llvm/lib/Analysis/TargetTransformInfo.cpp | 8 +
 .../AArch64/AArch64TargetTransformInfo.cpp | 14 +
 .../AArch64/AArch64TargetTransformInfo.h | 4 +
 llvm/lib/TargetParser/AArch64TargetParser.cpp | 31 +-
 llvm/lib/Transforms/IPO/GlobalOpt.cpp | 162 ++++++++
 .../Transforms/GlobalOpt/resolve-fmv-ifunc.ll | 365 ++++++++++++++++++
 9 files changed, 608 insertions(+), 10 deletions(-)
 create mode 100644 llvm/test/Transforms/GlobalOpt/resolve-fmv-ifunc.ll

diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index fe13fc676e303..71b204f9c3fec 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -1870,6 +1870,13 @@ class TargetTransformInfo {
   /// false, but it shouldn't matter what it returns anyway.
   bool hasArmWideBranch(bool Thumb) const;
 
+  /// Returns a bitmask constructed from the target-features or fmv-features
+  /// metadata of a function.
+  uint64_t getFeatureMask(const Function &F) const;
+
+  /// Returns true if this is an instance of a function with multiple versions.
+  bool isMultiversionedFunction(const Function &F) const;
+
   /// \return The maximum number of function arguments the target supports.
unsigned getMaxNumArgs() const; @@ -2312,6 +2319,8 @@ class TargetTransformInfo::Concept { virtual VPLegalization getVPLegalizationStrategy(const VPIntrinsic &PI) const = 0; virtual bool hasArmWideBranch(bool Thumb) const = 0; + virtual uint64_t getFeatureMask(const Function &F) const = 0; + virtual bool isMultiversionedFunction(const Function &F) const = 0; virtual unsigned getMaxNumArgs() const = 0; virtual unsigned getNumBytesToPadGlobalArray(unsigned Size, Type *ArrayType) const = 0; @@ -3144,6 +3153,14 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept { return Impl.hasArmWideBranch(Thumb); } + uint64_t getFeatureMask(const Function &F) const override { + return Impl.getFeatureMask(F); + } + + bool isMultiversionedFunction(const Function &F) const override { + return Impl.isMultiversionedFunction(F); + } + unsigned getMaxNumArgs() const override { return Impl.getMaxNumArgs(); } diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h index 7ac3063ca9a37..dcef4a1abcfa3 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -1039,6 +1039,10 @@ class TargetTransformInfoImplBase { bool hasArmWideBranch(bool) const { return false; } + uint64_t getFeatureMask(const Function &F) const { return 0; } + + bool isMultiversionedFunction(const Function &F) const { return false; } + unsigned getMaxNumArgs() const { return UINT_MAX; } unsigned getNumBytesToPadGlobalArray(unsigned Size, Type *ArrayType) const { diff --git a/llvm/include/llvm/TargetParser/AArch64TargetParser.h b/llvm/include/llvm/TargetParser/AArch64TargetParser.h index 63f06a3a69298..0338770593bc4 100644 --- a/llvm/include/llvm/TargetParser/AArch64TargetParser.h +++ b/llvm/include/llvm/TargetParser/AArch64TargetParser.h @@ -270,13 +270,16 @@ void fillValidCPUArchList(SmallVectorImpl &Values); bool isX18ReservedByDefault(const Triple &TT); -// Return the priority for a given set of FMV features. +// For a given set of feature names, which can be either target-features, or +// fmv-features metadata, expand their dependencies and then return a bitmask +// corresponding to the entries of AArch64::FeatPriorities. uint64_t getFMVPriority(ArrayRef Features); -// For given feature names, return a bitmask corresponding to the entries of -// AArch64::CPUFeatures. The values in CPUFeatures are not bitmasks themselves, -// they are sequential (0, 1, 2, 3, ...). The resulting bitmask is used at -// runtime to test whether a certain FMV feature is available on the host. +// For a given set of FMV feature names, expand their dependencies and then +// return a bitmask corresponding to the entries of AArch64::CPUFeatures. +// The values in CPUFeatures are not bitmasks themselves, they are sequential +// (0, 1, 2, 3, ...). The resulting bitmask is used at runtime to test whether +// a certain FMV feature is available on the host. 
uint64_t getCpuSupportsMask(ArrayRef Features); void PrintSupportedExtensions(); diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp index df42dc2746daf..8b9722d047edc 100644 --- a/llvm/lib/Analysis/TargetTransformInfo.cpp +++ b/llvm/lib/Analysis/TargetTransformInfo.cpp @@ -1383,6 +1383,14 @@ bool TargetTransformInfo::hasArmWideBranch(bool Thumb) const { return TTIImpl->hasArmWideBranch(Thumb); } +uint64_t TargetTransformInfo::getFeatureMask(const Function &F) const { + return TTIImpl->getFeatureMask(F); +} + +bool TargetTransformInfo::isMultiversionedFunction(const Function &F) const { + return TTIImpl->isMultiversionedFunction(F); +} + unsigned TargetTransformInfo::getMaxNumArgs() const { return TTIImpl->getMaxNumArgs(); } diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index 932a6f9ce23fd..7f10bfed739b4 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -23,6 +23,7 @@ #include "llvm/IR/IntrinsicsAArch64.h" #include "llvm/IR/PatternMatch.h" #include "llvm/Support/Debug.h" +#include "llvm/TargetParser/AArch64TargetParser.h" #include "llvm/Transforms/InstCombine/InstCombiner.h" #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h" #include @@ -248,6 +249,19 @@ static bool hasPossibleIncompatibleOps(const Function *F) { return false; } +uint64_t AArch64TTIImpl::getFeatureMask(const Function &F) const { + StringRef AttributeStr = + isMultiversionedFunction(F) ? "fmv-features" : "target-features"; + StringRef FeatureStr = F.getFnAttribute(AttributeStr).getValueAsString(); + SmallVector Features; + FeatureStr.split(Features, ","); + return AArch64::getFMVPriority(Features); +} + +bool AArch64TTIImpl::isMultiversionedFunction(const Function &F) const { + return F.hasFnAttribute("fmv-features"); +} + bool AArch64TTIImpl::areInlineCompatible(const Function *Caller, const Function *Callee) const { SMEAttrs CallerAttrs(*Caller), CalleeAttrs(*Callee); diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h index 8e7e590c173ff..1eb805ae00b1b 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h @@ -89,6 +89,10 @@ class AArch64TTIImpl : public BasicTTIImplBase { unsigned getInlineCallPenalty(const Function *F, const CallBase &Call, unsigned DefaultCallPenalty) const; + uint64_t getFeatureMask(const Function &F) const; + + bool isMultiversionedFunction(const Function &F) const; + /// \name Scalar TTI Implementations /// @{ diff --git a/llvm/lib/TargetParser/AArch64TargetParser.cpp b/llvm/lib/TargetParser/AArch64TargetParser.cpp index 34ca03a47e0a4..e13c6e6d28c2b 100644 --- a/llvm/lib/TargetParser/AArch64TargetParser.cpp +++ b/llvm/lib/TargetParser/AArch64TargetParser.cpp @@ -48,12 +48,33 @@ std::optional AArch64::ArchInfo::findBySubArch(StringRef SubA return {}; } +std::optional lookupFMVByID(AArch64::ArchExtKind ExtID) { + for (const AArch64::FMVInfo &Info : AArch64::getFMVInfo()) + if (Info.ID && *Info.ID == ExtID) + return Info; + return {}; +} + uint64_t AArch64::getFMVPriority(ArrayRef Features) { - uint64_t Priority = 0; - for (StringRef Feature : Features) - if (std::optional Info = parseFMVExtension(Feature)) - Priority |= (1ULL << Info->PriorityBit); - return Priority; + // Transitively enable the Arch Extensions which correspond to each feature. 
+ ExtensionSet FeatureBits; + for (const StringRef Feature : Features) { + std::optional FMV = parseFMVExtension(Feature); + if (!FMV) { + if (std::optional Info = targetFeatureToExtension(Feature)) + FMV = lookupFMVByID(Info->ID); + } + if (FMV && FMV->ID) + FeatureBits.enable(*FMV->ID); + } + + // Construct a bitmask for all the transitively enabled Arch Extensions. + uint64_t PriorityMask = 0; + for (const FMVInfo &Info : getFMVInfo()) + if (Info.ID && FeatureBits.Enabled.test(*Info.ID)) + PriorityMask |= (1ULL << Info.PriorityBit); + + return PriorityMask; } uint64_t AArch64::getCpuSupportsMask(ArrayRef Features) { diff --git a/llvm/lib/Transforms/IPO/GlobalOpt.cpp b/llvm/lib/Transforms/IPO/GlobalOpt.cpp index 78cd249c9c16a..bf0cacc6224be 100644 --- a/llvm/lib/Transforms/IPO/GlobalOpt.cpp +++ b/llvm/lib/Transforms/IPO/GlobalOpt.cpp @@ -2641,6 +2641,165 @@ DeleteDeadIFuncs(Module &M, return Changed; } +// Follows the use-def chain of \p V backwards until it finds a Function, +// in which case it collects in \p Versions. Return true on successful +// use-def chain traversal, false otherwise. +static bool collectVersions(TargetTransformInfo &TTI, Value *V, + SmallVectorImpl &Versions) { + if (auto *F = dyn_cast(V)) { + if (!TTI.isMultiversionedFunction(*F)) + return false; + Versions.push_back(F); + } else if (auto *Sel = dyn_cast(V)) { + if (!collectVersions(TTI, Sel->getTrueValue(), Versions)) + return false; + if (!collectVersions(TTI, Sel->getFalseValue(), Versions)) + return false; + } else if (auto *Phi = dyn_cast(V)) { + for (unsigned I = 0, E = Phi->getNumIncomingValues(); I != E; ++I) + if (!collectVersions(TTI, Phi->getIncomingValue(I), Versions)) + return false; + } else { + // Unknown instruction type. Bail. + return false; + } + return true; +} + +// Bypass the IFunc Resolver of MultiVersioned functions when possible. To +// deduce whether the optimization is legal we need to compare the target +// features between caller and callee versions. The criteria for bypassing +// the resolver are the following: +// +// * If the callee's feature set is a subset of the caller's feature set, +// then the callee is a candidate for direct call. +// +// * Among such candidates the one of highest priority is the best match +// and it shall be picked, unless there is a version of the callee with +// higher priority than the best match which cannot be picked from a +// higher priority caller (directly or through the resolver). +// +// * For every higher priority callee version than the best match, there +// is a higher priority caller version whose feature set availability +// is implied by the callee's feature set. +// +static bool OptimizeNonTrivialIFuncs( + Module &M, function_ref GetTTI) { + bool Changed = false; + + // Cache containing the mask constructed from a function's target features. + DenseMap FeatureMask; + + for (GlobalIFunc &IF : M.ifuncs()) { + if (IF.isInterposable()) + continue; + + Function *Resolver = IF.getResolverFunction(); + if (!Resolver) + continue; + + if (Resolver->isInterposable()) + continue; + + TargetTransformInfo &TTI = GetTTI(*Resolver); + + // Discover the callee versions. + SmallVector Callees; + if (any_of(*Resolver, [&TTI, &Callees](BasicBlock &BB) { + if (auto *Ret = dyn_cast_or_null(BB.getTerminator())) + if (!collectVersions(TTI, Ret->getReturnValue(), Callees)) + return true; + return false; + })) + continue; + + assert(!Callees.empty() && "Expecting successful collection of versions"); + + // Cache the feature mask for each callee. 
+ for (Function *Callee : Callees) { + auto [It, Inserted] = FeatureMask.try_emplace(Callee); + if (Inserted) + It->second = TTI.getFeatureMask(*Callee); + } + + // Sort the callee versions in decreasing priority order. + sort(Callees, [&](auto *LHS, auto *RHS) { + return FeatureMask[LHS] > FeatureMask[RHS]; + }); + + // Find the callsites and cache the feature mask for each caller. + SmallVector Callers; + DenseMap> CallSites; + for (User *U : IF.users()) { + if (auto *CB = dyn_cast(U)) { + if (CB->getCalledOperand() == &IF) { + Function *Caller = CB->getFunction(); + auto [FeatIt, FeatInserted] = FeatureMask.try_emplace(Caller); + if (FeatInserted) + FeatIt->second = TTI.getFeatureMask(*Caller); + auto [CallIt, CallInserted] = CallSites.try_emplace(Caller); + if (CallInserted) + Callers.push_back(Caller); + CallIt->second.push_back(CB); + } + } + } + + // Sort the caller versions in decreasing priority order. + sort(Callers, [&](auto *LHS, auto *RHS) { + return FeatureMask[LHS] > FeatureMask[RHS]; + }); + + auto implies = [](uint64_t A, uint64_t B) { return (A & B) == B; }; + + // Index to the highest priority candidate. + unsigned I = 0; + // Now try to redirect calls starting from higher priority callers. + for (Function *Caller : Callers) { + assert(I < Callees.size() && "Found callers of equal priority"); + + Function *Callee = Callees[I]; + uint64_t CallerBits = FeatureMask[Caller]; + uint64_t CalleeBits = FeatureMask[Callee]; + + // In the case of FMV callers, we know that all higher priority callers + // than the current one did not get selected at runtime, which helps + // reason about the callees (if they have versions that mandate presence + // of the features which we already know are unavailable on this target). + if (TTI.isMultiversionedFunction(*Caller)) { + // If the feature set of the caller implies the feature set of the + // highest priority candidate then it shall be picked. In case of + // identical sets advance the candidate index one position. + if (CallerBits == CalleeBits) + ++I; + else if (!implies(CallerBits, CalleeBits)) { + // Keep advancing the candidate index as long as the caller's + // features are a subset of the current candidate's. + while (implies(CalleeBits, CallerBits)) { + if (++I == Callees.size()) + break; + CalleeBits = FeatureMask[Callees[I]]; + } + continue; + } + } else { + // We can't reason much about non-FMV callers. Just pick the highest + // priority callee if it matches, otherwise bail. + if (I > 0 || !implies(CallerBits, CalleeBits)) + continue; + } + auto &Calls = CallSites[Caller]; + for (CallBase *CS : Calls) + CS->setCalledOperand(Callee); + Changed = true; + } + if (IF.use_empty() || + all_of(IF.users(), [](User *U) { return isa(U); })) + NumIFuncsResolved++; + } + return Changed; +} + static bool optimizeGlobalsInModule(Module &M, const DataLayout &DL, function_ref GetTLI, @@ -2707,6 +2866,9 @@ optimizeGlobalsInModule(Module &M, const DataLayout &DL, // Optimize IFuncs whose callee's are statically known. LocalChange |= OptimizeStaticIFuncs(M); + // Optimize IFuncs based on the target features of the caller. + LocalChange |= OptimizeNonTrivialIFuncs(M, GetTTI); + // Remove any IFuncs that are now dead. 
LocalChange |= DeleteDeadIFuncs(M, NotDiscardableComdats); diff --git a/llvm/test/Transforms/GlobalOpt/resolve-fmv-ifunc.ll b/llvm/test/Transforms/GlobalOpt/resolve-fmv-ifunc.ll new file mode 100644 index 0000000000000..90bd98a9b0d38 --- /dev/null +++ b/llvm/test/Transforms/GlobalOpt/resolve-fmv-ifunc.ll @@ -0,0 +1,365 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --filter "call i32 @(test_single_bb_resolver|test_multi_bb_resolver|test_caller_feats_not_implied|test_non_fmv_caller|test_priority|test_alternative_names)" --version 4 +; RUN: opt --passes=globalopt -o - -S < %s | FileCheck %s + +target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" +target triple = "aarch64-unknown-linux-gnu" + +$test_single_bb_resolver.resolver = comdat any +$test_multi_bb_resolver.resolver = comdat any +$test_caller_feats_not_implied.resolver = comdat any +$test_non_fmv_caller.resolver = comdat any +$test_priority.resolver = comdat any +$test_alternative_names.resolver = comdat any + +@__aarch64_cpu_features = external local_unnamed_addr global { i64 } + +@test_single_bb_resolver = weak_odr ifunc i32 (), ptr @test_single_bb_resolver.resolver +@test_multi_bb_resolver = weak_odr ifunc i32 (), ptr @test_multi_bb_resolver.resolver +@test_caller_feats_not_implied = weak_odr ifunc i32 (), ptr @test_caller_feats_not_implied.resolver +@test_non_fmv_caller = weak_odr ifunc i32 (), ptr @test_non_fmv_caller.resolver +@test_priority = weak_odr ifunc i32 (), ptr @test_priority.resolver +@test_alternative_names = weak_odr ifunc i32 (), ptr @test_alternative_names.resolver + +declare void @__init_cpu_features_resolver() local_unnamed_addr + +declare i32 @test_single_bb_resolver.default() #0 +declare i32 @test_single_bb_resolver._Msve() #1 +declare i32 @test_single_bb_resolver._Msve2() #2 + +define weak_odr ptr @test_single_bb_resolver.resolver() comdat { +; CHECK-LABEL: define weak_odr ptr @test_single_bb_resolver.resolver() comdat { +resolver_entry: + tail call void @__init_cpu_features_resolver() + %0 = load i64, ptr @__aarch64_cpu_features, align 8 + %1 = and i64 %0, 68719476736 + %.not = icmp eq i64 %1, 0 + %2 = and i64 %0, 1073741824 + %.not3 = icmp eq i64 %2, 0 + %test_single_bb_resolver._Msve.test_single_bb_resolver.default = select i1 %.not3, ptr @test_single_bb_resolver.default, ptr @test_single_bb_resolver._Msve + %common.ret.op = select i1 %.not, ptr %test_single_bb_resolver._Msve.test_single_bb_resolver.default, ptr @test_single_bb_resolver._Msve2 + ret ptr %common.ret.op +} + +define i32 @caller1._Msve() #1 { +; CHECK-LABEL: define i32 @caller1._Msve( +; CHECK-SAME: ) local_unnamed_addr #[[ATTR1:[0-9]+]] { +; CHECK: [[CALL:%.*]] = tail call i32 @test_single_bb_resolver._Msve() +; +entry: + %call = tail call i32 @test_single_bb_resolver() + ret i32 %call +} + +define i32 @caller1._Msve2() #2 { +; CHECK-LABEL: define i32 @caller1._Msve2( +; CHECK-SAME: ) local_unnamed_addr #[[ATTR2:[0-9]+]] { +; CHECK: [[CALL:%.*]] = tail call i32 @test_single_bb_resolver._Msve2() +; +entry: + %call = tail call i32 @test_single_bb_resolver() + ret i32 %call +} + +define i32 @caller1.default() #0 { +; CHECK-LABEL: define i32 @caller1.default( +; CHECK-SAME: ) local_unnamed_addr #[[ATTR0:[0-9]+]] { +; CHECK: [[CALL:%.*]] = tail call i32 @test_single_bb_resolver.default() +; +entry: + %call = tail call i32 @test_single_bb_resolver() + ret i32 %call +} + +declare i32 @test_multi_bb_resolver._Mmops() #3 +declare i32 @test_multi_bb_resolver._Msve2() #2 +declare i32 
@test_multi_bb_resolver._Msve() #1 +declare i32 @test_multi_bb_resolver.default() #0 + +define weak_odr ptr @test_multi_bb_resolver.resolver() comdat { +; CHECK-LABEL: define weak_odr ptr @test_multi_bb_resolver.resolver() comdat { +resolver_entry: + tail call void @__init_cpu_features_resolver() + %0 = load i64, ptr @__aarch64_cpu_features, align 8 + %1 = and i64 %0, 576460752303423488 + %.not = icmp eq i64 %1, 0 + br i1 %.not, label %resolver_else, label %common.ret + +common.ret: ; preds = %resolver_else2, %resolver_else, %resolver_entry + %common.ret.op = phi ptr [ @test_multi_bb_resolver._Mmops, %resolver_entry ], [ @test_multi_bb_resolver._Msve2, %resolver_else ], [ %test_multi_bb_resolver._Msve.test_multi_bb_resolver.default, %resolver_else2 ] + ret ptr %common.ret.op + +resolver_else: ; preds = %resolver_entry + %2 = and i64 %0, 68719476736 + %.not5 = icmp eq i64 %2, 0 + br i1 %.not5, label %resolver_else2, label %common.ret + +resolver_else2: ; preds = %resolver_else + %3 = and i64 %0, 1073741824 + %.not6 = icmp eq i64 %3, 0 + %test_multi_bb_resolver._Msve.test_multi_bb_resolver.default = select i1 %.not6, ptr @test_multi_bb_resolver.default, ptr @test_multi_bb_resolver._Msve + br label %common.ret +} + +define i32 @caller2._MmopsMsve2() #4 { +; CHECK-LABEL: define i32 @caller2._MmopsMsve2( +; CHECK-SAME: ) local_unnamed_addr #[[ATTR4:[0-9]+]] { +; CHECK: [[CALL:%.*]] = tail call i32 @test_multi_bb_resolver._Mmops() +; +entry: + %call = tail call i32 @test_multi_bb_resolver() + ret i32 %call +} + +define i32 @caller2._Mmops() #3 { +; CHECK-LABEL: define i32 @caller2._Mmops( +; CHECK-SAME: ) local_unnamed_addr #[[ATTR3:[0-9]+]] { +; CHECK: [[CALL:%.*]] = tail call i32 @test_multi_bb_resolver._Mmops() +; +entry: + %call = tail call i32 @test_multi_bb_resolver() + ret i32 %call +} + +define i32 @caller2._Msve() #1 { +; CHECK-LABEL: define i32 @caller2._Msve( +; CHECK-SAME: ) local_unnamed_addr #[[ATTR1]] { +; CHECK: [[CALL:%.*]] = tail call i32 @test_multi_bb_resolver() +; +entry: + %call = tail call i32 @test_multi_bb_resolver() + ret i32 %call +} + +define i32 @caller2.default() #0 { +; CHECK-LABEL: define i32 @caller2.default( +; CHECK-SAME: ) local_unnamed_addr #[[ATTR0]] { +; CHECK: [[CALL:%.*]] = tail call i32 @test_multi_bb_resolver.default() +; +entry: + %call = tail call i32 @test_multi_bb_resolver() + ret i32 %call +} + +declare i32 @test_caller_feats_not_implied._Mmops() #3 +declare i32 @test_caller_feats_not_implied._Msme() #5 +declare i32 @test_caller_feats_not_implied._Msve() #1 +declare i32 @test_caller_feats_not_implied.default() #0 + +define weak_odr ptr @test_caller_feats_not_implied.resolver() comdat { +; CHECK-LABEL: define weak_odr ptr @test_caller_feats_not_implied.resolver() comdat { +resolver_entry: + tail call void @__init_cpu_features_resolver() + %0 = load i64, ptr @__aarch64_cpu_features, align 8 + %1 = and i64 %0, 576460752303423488 + %.not = icmp eq i64 %1, 0 + br i1 %.not, label %resolver_else, label %common.ret + +common.ret: ; preds = %resolver_else2, %resolver_else, %resolver_entry + %common.ret.op = phi ptr [ @test_caller_feats_not_implied._Mmops, %resolver_entry ], [ @test_caller_feats_not_implied._Msme, %resolver_else ], [ %test_caller_feats_not_implied._Msve.test_caller_feats_not_implied.default, %resolver_else2 ] + ret ptr %common.ret.op + +resolver_else: ; preds = %resolver_entry + %2 = and i64 %0, 4398046511104 + %.not5 = icmp eq i64 %2, 0 + br i1 %.not5, label %resolver_else2, label %common.ret + +resolver_else2: ; preds = %resolver_else + %3 
= and i64 %0, 1073741824 + %.not6 = icmp eq i64 %3, 0 + %test_caller_feats_not_implied._Msve.test_caller_feats_not_implied.default = select i1 %.not6, ptr @test_caller_feats_not_implied.default, ptr @test_caller_feats_not_implied._Msve + br label %common.ret +} + +define i32 @caller3._Mmops() #3 { +; CHECK-LABEL: define i32 @caller3._Mmops( +; CHECK-SAME: ) local_unnamed_addr #[[ATTR3]] { +; CHECK: [[CALL:%.*]] = tail call i32 @test_caller_feats_not_implied._Mmops() +; +entry: + %call = tail call i32 @test_caller_feats_not_implied() + ret i32 %call +} + +define i32 @caller3._Msve() #1 { +; CHECK-LABEL: define i32 @caller3._Msve( +; CHECK-SAME: ) local_unnamed_addr #[[ATTR1]] { +; CHECK: [[CALL:%.*]] = tail call i32 @test_caller_feats_not_implied() +; +entry: + %call = tail call i32 @test_caller_feats_not_implied() + ret i32 %call +} + +define i32 @caller3.default() #0 { +; CHECK-LABEL: define i32 @caller3.default( +; CHECK-SAME: ) local_unnamed_addr #[[ATTR0]] { +; CHECK: [[CALL:%.*]] = tail call i32 @test_caller_feats_not_implied() +; +entry: + %call = tail call i32 @test_caller_feats_not_implied() + ret i32 %call +} + +declare i32 @test_non_fmv_caller._Maes() #6 +declare i32 @test_non_fmv_caller._Msm4() #7 +declare i32 @test_non_fmv_caller.default() #0 + +define weak_odr ptr @test_non_fmv_caller.resolver() comdat { +; CHECK-LABEL: define weak_odr ptr @test_non_fmv_caller.resolver() comdat { +resolver_entry: + tail call void @__init_cpu_features_resolver() + %0 = load i64, ptr @__aarch64_cpu_features, align 8 + %1 = and i64 %0, 32768 + %.not = icmp eq i64 %1, 0 + %test_non_fmv_caller._Maes.test_non_fmv_caller.default = select i1 %.not, ptr @test_non_fmv_caller.default, ptr @test_non_fmv_caller._Maes + ret ptr %test_non_fmv_caller._Maes.test_non_fmv_caller.default +} + +define i32 @caller4() #8 { +; CHECK-LABEL: define i32 @caller4( +; CHECK-SAME: ) local_unnamed_addr #[[ATTR7:[0-9]+]] { +; CHECK: [[CALL:%.*]] = tail call i32 @test_non_fmv_caller._Maes() +; +entry: + %call = tail call i32 @test_non_fmv_caller() + ret i32 %call +} + +define i32 @caller5() #9 { +; CHECK-LABEL: define i32 @caller5( +; CHECK-SAME: ) local_unnamed_addr #[[ATTR8:[0-9]+]] { +; CHECK: [[CALL:%.*]] = tail call i32 @test_non_fmv_caller() +; +entry: + %call = tail call i32 @test_non_fmv_caller() + ret i32 %call +} + +declare i32 @test_priority._Msve2-sha3() #10 +declare i32 @test_priority._Mls64Mssbs() #11 +declare i32 @test_priority._MflagmMlseMrng() #12 +declare i32 @test_priority.default() #0 + +define weak_odr ptr @test_priority.resolver() comdat { +; CHECK-LABEL: define weak_odr ptr @test_priority.resolver() comdat { +resolver_entry: + tail call void @__init_cpu_features_resolver() + %0 = load i64, ptr @__aarch64_cpu_features, align 8 + %1 = and i64 %0, 131 + %2 = icmp eq i64 %1, 131 + br i1 %2, label %common.ret, label %resolver_else + +common.ret: ; preds = %resolver_else2, %resolver_else, %resolver_entry + %common.ret.op = phi ptr [ @test_priority._MflagmMlseMrng, %resolver_entry ], [ @test_priority._Mls64Mssbs, %resolver_else ], [ %test_priority._Msve2-sha3.test_priority.default, %resolver_else2 ] + ret ptr %common.ret.op + +resolver_else: ; preds = %resolver_entry + %3 = and i64 %0, 9570149208162304 + %4 = icmp eq i64 %3, 9570149208162304 + br i1 %4, label %common.ret, label %resolver_else2 + +resolver_else2: ; preds = %resolver_else + %5 = and i64 %0, 1099511627776 + %.not = icmp eq i64 %5, 0 + %test_priority._Msve2-sha3.test_priority.default = select i1 %.not, ptr @test_priority.default, ptr 
@test_priority._Msve2-sha3 + br label %common.ret +} + +define i32 @caller6._MflagmMls64MlseMrngMssbsMsve2-sha3() #13 { +; CHECK-LABEL: define i32 @caller6._MflagmMls64MlseMrngMssbsMsve2-sha3( +; CHECK-SAME: ) local_unnamed_addr #[[ATTR12:[0-9]+]] { +; CHECK: [[CALL:%.*]] = tail call i32 @test_priority._Mls64Mssbs() +; +entry: + %call = tail call i32 @test_priority() + ret i32 %call +} + +declare i32 @test_alternative_names._Mdpb2Mfrintts() #14 +declare i32 @test_alternative_names._Mflagm2Mfrintts() #15 +declare i32 @test_alternative_names._Mrcpc2() #16 +declare i32 @test_alternative_names.default() #0 + +define weak_odr ptr @test_alternative_names.resolver() comdat { +; CHECK-LABEL: define weak_odr ptr @test_alternative_names.resolver() comdat { +resolver_entry: + tail call void @__init_cpu_features_resolver() + %0 = load i64, ptr @__aarch64_cpu_features, align 8 + %1 = and i64 %0, 17563904 + %2 = icmp eq i64 %1, 17563904 + br i1 %2, label %common.ret, label %resolver_else + +common.ret: ; preds = %resolver_else2, %resolver_else, %resolver_entry + %common.ret.op = phi ptr [ @test_alternative_names._Mdpb2Mfrintts, %resolver_entry ], [ @test_alternative_names._Mflagm2Mfrintts, %resolver_else ], [ %test_alternative_names._Mrcpc2.test_alternative_names.default, %resolver_else2 ] + ret ptr %common.ret.op + +resolver_else: ; preds = %resolver_entry + %3 = and i64 %0, 16777478 + %4 = icmp eq i64 %3, 16777478 + br i1 %4, label %common.ret, label %resolver_else2 + +resolver_else2: ; preds = %resolver_else + %5 = and i64 %0, 12582912 + %6 = icmp eq i64 %5, 12582912 + %test_alternative_names._Mrcpc2.test_alternative_names.default = select i1 %6, ptr @test_alternative_names._Mrcpc2, ptr @test_alternative_names.default + br label %common.ret +} + +define i32 @caller7._Mdpb2Mfrintts() #14 { +; CHECK-LABEL: define i32 @caller7._Mdpb2Mfrintts( +; CHECK-SAME: ) local_unnamed_addr #[[ATTR13:[0-9]+]] { +; CHECK: [[CALL:%.*]] = tail call i32 @test_alternative_names._Mdpb2Mfrintts() +; +entry: + %call = tail call i32 @test_alternative_names() + ret i32 %call +} + +define i32 @caller7._Mfrintts() #17 { +; CHECK-LABEL: define i32 @caller7._Mfrintts( +; CHECK-SAME: ) local_unnamed_addr #[[ATTR16:[0-9]+]] { +; CHECK: [[CALL:%.*]] = tail call i32 @test_alternative_names() +; +entry: + %call = tail call i32 @test_alternative_names() + ret i32 %call +} + +define i32 @caller7._Mrcpc2() #16 { +; CHECK-LABEL: define i32 @caller7._Mrcpc2( +; CHECK-SAME: ) local_unnamed_addr #[[ATTR15:[0-9]+]] { +; CHECK: [[CALL:%.*]] = tail call i32 @test_alternative_names._Mrcpc2() +; +entry: + %call = tail call i32 @test_alternative_names() + ret i32 %call +} + +define i32 @caller7.default() #0 { +; CHECK-LABEL: define i32 @caller7.default( +; CHECK-SAME: ) local_unnamed_addr #[[ATTR0]] { +; CHECK: [[CALL:%.*]] = tail call i32 @test_alternative_names.default() +; +entry: + %call = tail call i32 @test_alternative_names() + ret i32 %call +} + +attributes #0 = { "fmv-features" } +attributes #1 = { "fmv-features"="sve" } +attributes #2 = { "fmv-features"="sve2" } +attributes #3 = { "fmv-features"="mops" } +attributes #4 = { "fmv-features"="mops,sve2" } +attributes #5 = { "fmv-features"="sme" } +attributes #6 = { "fmv-features"="aes" } +attributes #7 = { "fmv-features"="sm4" } +attributes #8 = { "target-features"="+aes,+fp-armv8,+neon,+outline-atomics,+v8a" } +attributes #9 = { "target-features"="+fp-armv8,+neon,+outline-atomics,+v8a,+sm4" } +attributes #10 = { "fmv-features"="sve2-sha3" } +attributes #11 = { "fmv-features"="ls64,ssbs" } 
+attributes #12 = { "fmv-features"="flagm,lse,rng" } +attributes #13 = { "fmv-features"="flagm,ls64,lse,rng,ssbs,sve2-sha3" } +attributes #14 = { "fmv-features"="dpb2,frintts" } +attributes #15 = { "fmv-features"="flagm2,frintts" } +attributes #16 = { "fmv-features"="rcpc2" } +attributes #17 = { "fmv-features"="frintts" } From ad282f4c1fdcb6e03914d9dab4f85fad5b16e864 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Fri, 17 Jan 2025 10:50:05 +0000 Subject: [PATCH 26/45] [X86] Rename combineScalarToVector to combineSCALAR_TO_VECTOR. NFC. Match the file style of using the ISD NodeType name for the combine/lower method name. --- llvm/lib/Target/X86/X86ISelLowering.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 6d69665c17565..de5bb08ae3a39 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -58572,8 +58572,8 @@ static SDValue combineEXTRACT_SUBVECTOR(SDNode *N, SelectionDAG &DAG, return SDValue(); } -static SDValue combineScalarToVector(SDNode *N, SelectionDAG &DAG, - const X86Subtarget &Subtarget) { +static SDValue combineSCALAR_TO_VECTOR(SDNode *N, SelectionDAG &DAG, + const X86Subtarget &Subtarget) { EVT VT = N->getValueType(0); SDValue Src = N->getOperand(0); SDLoc DL(N); @@ -59266,7 +59266,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, // clang-format off default: break; case ISD::SCALAR_TO_VECTOR: - return combineScalarToVector(N, DAG, Subtarget); + return combineSCALAR_TO_VECTOR(N, DAG, Subtarget); case ISD::EXTRACT_VECTOR_ELT: case X86ISD::PEXTRW: case X86ISD::PEXTRB: From 0ab368c5735328298d99dcfb80da12e7be028583 Mon Sep 17 00:00:00 2001 From: Ramkumar Ramachandra Date: Fri, 17 Jan 2025 10:54:39 +0000 Subject: [PATCH 27/45] SCEV/test: cover implied-via-addition (#123082) Since cf2e828 (SCEV: regen some tests with UTC) had the side-effect of moving an implied-via-addition test into IndVarSimplify, implication via addition is no longer covered in the SCEV tests. Fix this by writing fresh tests and checking backedge-taken output from SCEV. --- .../ScalarEvolution/implied-via-addition.ll | 111 ++++++++++++++++++ 1 file changed, 111 insertions(+) create mode 100644 llvm/test/Analysis/ScalarEvolution/implied-via-addition.ll diff --git a/llvm/test/Analysis/ScalarEvolution/implied-via-addition.ll b/llvm/test/Analysis/ScalarEvolution/implied-via-addition.ll new file mode 100644 index 0000000000000..7ab6221d0da53 --- /dev/null +++ b/llvm/test/Analysis/ScalarEvolution/implied-via-addition.ll @@ -0,0 +1,111 @@ +; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 5 +; RUN: opt < %s -disable-output -passes="print" \ +; RUN: -scalar-evolution-classify-expressions=0 2>&1 | FileCheck %s + +define void @implied1(i32 %n) { +; Prove that (n s> 1) ===> (n - 1 s> 0). 
+; CHECK-LABEL: 'implied1' +; CHECK-NEXT: Determining loop execution counts for: @implied1 +; CHECK-NEXT: Loop %header: backedge-taken count is (-2 + %n) +; CHECK-NEXT: Loop %header: constant max backedge-taken count is i32 2147483645 +; CHECK-NEXT: Loop %header: symbolic max backedge-taken count is (-2 + %n) +; CHECK-NEXT: Loop %header: Trip multiple is 1 +; +entry: + %cmp1 = icmp sgt i32 %n, 1 + %n.minus.1 = sub nsw i32 %n, 1 + call void @llvm.assume(i1 %cmp1) + br label %header + +header: + %indvar = phi i32 [ %indvar.next, %header ], [ 0, %entry ] + %indvar.next = add i32 %indvar, 1 + %exitcond = icmp sgt i32 %n.minus.1, %indvar.next + br i1 %exitcond, label %header, label %exit + +exit: + ret void +} + +define void @implied1_neg(i32 %n) { +; Prove that (n s> 0) =\=> (n - 1 s> 0). +; CHECK-LABEL: 'implied1_neg' +; CHECK-NEXT: Determining loop execution counts for: @implied1_neg +; CHECK-NEXT: Loop %header: backedge-taken count is (-1 + (1 smax (-1 + %n))) +; CHECK-NEXT: Loop %header: constant max backedge-taken count is i32 2147483645 +; CHECK-NEXT: Loop %header: symbolic max backedge-taken count is (-1 + (1 smax (-1 + %n))) +; CHECK-NEXT: Loop %header: Trip multiple is 1 +; +entry: + %cmp1 = icmp sgt i32 %n, 0 + %n.minus.1 = sub nsw i32 %n, 1 + call void @llvm.assume(i1 %cmp1) + br label %header + +header: + %indvar = phi i32 [ %indvar.next, %header ], [ 0, %entry ] + %indvar.next = add i32 %indvar, 1 + %exitcond = icmp sgt i32 %n.minus.1, %indvar.next + br i1 %exitcond, label %header, label %exit + +exit: + ret void +} + +define void @implied2(i32 %n) { +; Prove that (n u>= -1) ===> (n + 1 u>= 0). +; CHECK-LABEL: 'implied2' +; CHECK-NEXT: Determining loop execution counts for: @implied2 +; CHECK-NEXT: Loop %header: Unpredictable backedge-taken count. +; CHECK-NEXT: Loop %header: Unpredictable constant max backedge-taken count. +; CHECK-NEXT: Loop %header: Unpredictable symbolic max backedge-taken count. +; CHECK-NEXT: Loop %header: Predicated backedge-taken count is (1 + (zext i32 %n to i64)) +; CHECK-NEXT: Predicates: +; CHECK-NEXT: {1,+,1}<%header> Added Flags: +; CHECK-NEXT: Loop %header: Predicated constant max backedge-taken count is i64 4294967296 +; CHECK-NEXT: Predicates: +; CHECK-NEXT: {1,+,1}<%header> Added Flags: +; CHECK-NEXT: Loop %header: Predicated symbolic max backedge-taken count is (1 + (zext i32 %n to i64)) +; CHECK-NEXT: Predicates: +; CHECK-NEXT: {1,+,1}<%header> Added Flags: +; +entry: + %cmp1 = icmp uge i32 %n, -1 + %n.1 = add nuw i32 %n, 1 + call void @llvm.assume(i1 %cmp1) + br label %header + +header: + %indvar = phi i32 [ %indvar.next, %header ], [ 0, %entry ] + %indvar.next = add i32 %indvar, 1 + %exitcond = icmp uge i32 %n.1, %indvar.next + br i1 %exitcond, label %header, label %exit + +exit: + ret void +} + +define void @implied2_neg(i32 %n) { +; Prove that (n u>= -1) =\=> (n - 1 s>= 0). 
+; CHECK-LABEL: 'implied2_neg' +; CHECK-NEXT: Determining loop execution counts for: @implied2_neg +; CHECK-NEXT: Loop %header: backedge-taken count is (-1 + (1 smax %n)) +; CHECK-NEXT: Loop %header: constant max backedge-taken count is i32 2147483646 +; CHECK-NEXT: Loop %header: symbolic max backedge-taken count is (-1 + (1 smax %n)) +; CHECK-NEXT: Loop %header: Trip multiple is 1 +; +entry: + %cmp1 = icmp uge i32 %n, -1 + %n.minus.1 = sub nuw nsw i32 %n, 1 + call void @llvm.assume(i1 %cmp1) + br label %header + +header: + %indvar = phi i32 [ %indvar.next, %header ], [ 0, %entry ] + %indvar.next = add i32 %indvar, 1 + %exitcond = icmp sge i32 %n.minus.1, %indvar.next + br i1 %exitcond, label %header, label %exit + +exit: + ret void +} From 437834e16be6d04e7b198dad8a42d507770251a1 Mon Sep 17 00:00:00 2001 From: Kiran Chandramohan Date: Fri, 17 Jan 2025 10:55:28 +0000 Subject: [PATCH 28/45] [Flang] Use a module directory to avoid race condition (#123215) Use a module directory in a test that uses another fortran test to avoid race conditions in module creation. --- flang/test/Lower/module_use.f90 | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/flang/test/Lower/module_use.f90 b/flang/test/Lower/module_use.f90 index ad43865470b68..b976663239ef5 100644 --- a/flang/test/Lower/module_use.f90 +++ b/flang/test/Lower/module_use.f90 @@ -1,5 +1,6 @@ -! RUN: bbc -emit-fir %S/module_definition.f90 -! RUN: bbc -emit-fir %s -o - | FileCheck %s +! RUN: rm -fr %t && mkdir -p %t +! RUN: bbc -emit-fir -module %t %S/module_definition.f90 +! RUN: bbc -emit-fir -J %t %s -o - | FileCheck %s ! Test use of module data not defined in this file. ! The modules are defined in module_definition.f90 From 21704a685de5f241acddf462e5f9b38d132cfcaa Mon Sep 17 00:00:00 2001 From: Stanislav Mekhanoshin Date: Fri, 17 Jan 2025 03:00:02 -0800 Subject: [PATCH 29/45] [AMDGPU] Fix printing hasInitWholeWave in mir (#123232) --- .../lib/Target/AMDGPU/SIMachineFunctionInfo.cpp | 2 +- llvm/test/CodeGen/MIR/AMDGPU/init-whole.wave.ll | 17 +++++++++++++++++ 2 files changed, 18 insertions(+), 1 deletion(-) create mode 100644 llvm/test/CodeGen/MIR/AMDGPU/init-whole.wave.ll diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp index 169f1369fb543..7de64bddf7884 100644 --- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp @@ -715,7 +715,7 @@ yaml::SIMachineFunctionInfo::SIMachineFunctionInfo( ArgInfo(convertArgumentInfo(MFI.getArgInfo(), TRI)), PSInputAddr(MFI.getPSInputAddr()), PSInputEnable(MFI.getPSInputEnable()), MaxMemoryClusterDWords(MFI.getMaxMemoryClusterDWords()), - Mode(MFI.getMode()) { + Mode(MFI.getMode()), HasInitWholeWave(MFI.hasInitWholeWave()) { for (Register Reg : MFI.getSGPRSpillPhysVGPRs()) SpillPhysVGPRS.push_back(regToString(Reg, TRI)); diff --git a/llvm/test/CodeGen/MIR/AMDGPU/init-whole.wave.ll b/llvm/test/CodeGen/MIR/AMDGPU/init-whole.wave.ll new file mode 100644 index 0000000000000..f3b8deff61918 --- /dev/null +++ b/llvm/test/CodeGen/MIR/AMDGPU/init-whole.wave.ll @@ -0,0 +1,17 @@ +; RUN: llc -global-isel=0 -march=amdgcn -mcpu=gfx1100 -stop-after=finalize-isel < %s | FileCheck --check-prefix=GCN %s +; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx1100 -stop-after=finalize-isel < %s | FileCheck --check-prefix=GCN %s + +; GCN-LABEL: name: init_wwm +; GCN: hasInitWholeWave: true +define void @init_wwm(ptr addrspace(1) inreg %p) { +entry: + %entry_exec = call i1 
@llvm.amdgcn.init.whole.wave() + br i1 %entry_exec, label %bb.1, label %bb.2 + +bb.1: + store i32 1, ptr addrspace(1) %p + br label %bb.2 + +bb.2: + ret void +} From 0d7c8c0e294d23fcfc9a396dafebe1465c471035 Mon Sep 17 00:00:00 2001 From: Benjamin Kramer Date: Fri, 17 Jan 2025 12:07:52 +0100 Subject: [PATCH 30/45] [bazel] Add new file added in 437834e16be6d04e7b198dad8a42d507770251a1 --- utils/bazel/llvm-project-overlay/clang/BUILD.bazel | 1 + 1 file changed, 1 insertion(+) diff --git a/utils/bazel/llvm-project-overlay/clang/BUILD.bazel b/utils/bazel/llvm-project-overlay/clang/BUILD.bazel index f72babb646a85..2286d4cd35e08 100644 --- a/utils/bazel/llvm-project-overlay/clang/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/clang/BUILD.bazel @@ -700,6 +700,7 @@ cc_library( includes = ["include"], textual_hdrs = [ # keep sorted + "include/clang/Basic/AllDiagnosticKinds.inc", "include/clang/Basic/AttrHasAttributeImpl.inc", "include/clang/Basic/AttrList.inc", "include/clang/Basic/AttrSubMatchRulesList.inc", From f66a5e220cbc2650a5843db854d0734d2aaa030f Mon Sep 17 00:00:00 2001 From: Pavel Labath Date: Fri, 17 Jan 2025 12:13:30 +0100 Subject: [PATCH 31/45] [lldb] Fix SBThread::StepOverUntil for discontinuous functions (#123046) I think the only issue here was that we would erroneously consider functions which are "in the middle" of the function were stepping to as a part of the function, and would try to step into them (likely stepping out of the function instead) instead of giving up early. --- lldb/include/lldb/Symbol/Function.h | 5 + lldb/source/API/SBThread.cpp | 5 +- .../thread/step_until/TestStepUntilAPI.py | 133 ++++++++++++++++++ .../thread/step_until/function.list | 1 + .../functionalities/thread/step_until/main.c | 3 + .../thread/step_until/symbol.order | 9 ++ 6 files changed, 154 insertions(+), 2 deletions(-) create mode 100644 lldb/test/API/functionalities/thread/step_until/TestStepUntilAPI.py create mode 100644 lldb/test/API/functionalities/thread/step_until/function.list create mode 100644 lldb/test/API/functionalities/thread/step_until/symbol.order diff --git a/lldb/include/lldb/Symbol/Function.h b/lldb/include/lldb/Symbol/Function.h index 157c007bdf0e8..d0b27269568b0 100644 --- a/lldb/include/lldb/Symbol/Function.h +++ b/lldb/include/lldb/Symbol/Function.h @@ -454,6 +454,11 @@ class Function : public UserID, public SymbolContextScope { /// and variables). const Address &GetAddress() const { return m_address; } + bool GetRangeContainingLoadAddress(lldb::addr_t load_addr, Target &target, + AddressRange &range) { + return m_block.GetRangeContainingLoadAddress(load_addr, target, range); + } + lldb::LanguageType GetLanguage() const; /// Find the file and line number of the source location of the start of the /// function. This will use the declaration if present and fall back on the diff --git a/lldb/source/API/SBThread.cpp b/lldb/source/API/SBThread.cpp index 4e61c83889b0b..cc848076dab5f 100644 --- a/lldb/source/API/SBThread.cpp +++ b/lldb/source/API/SBThread.cpp @@ -842,7 +842,6 @@ SBError SBThread::StepOverUntil(lldb::SBFrame &sb_frame, // appropriate error message. 
bool all_in_function = true; - AddressRange fun_range = frame_sc.function->GetAddressRange(); std::vector step_over_until_addrs; const bool abort_other_plans = false; @@ -859,7 +858,9 @@ SBError SBThread::StepOverUntil(lldb::SBFrame &sb_frame, addr_t step_addr = sc.line_entry.range.GetBaseAddress().GetLoadAddress(target); if (step_addr != LLDB_INVALID_ADDRESS) { - if (fun_range.ContainsLoadAddress(step_addr, target)) + AddressRange unused_range; + if (frame_sc.function->GetRangeContainingLoadAddress(step_addr, *target, + unused_range)) step_over_until_addrs.push_back(step_addr); else all_in_function = false; diff --git a/lldb/test/API/functionalities/thread/step_until/TestStepUntilAPI.py b/lldb/test/API/functionalities/thread/step_until/TestStepUntilAPI.py new file mode 100644 index 0000000000000..de3892ed278f8 --- /dev/null +++ b/lldb/test/API/functionalities/thread/step_until/TestStepUntilAPI.py @@ -0,0 +1,133 @@ +import lldb +from lldbsuite.test.decorators import * +from lldbsuite.test.lldbtest import * +from lldbsuite.test import lldbutil + + +class TestStepUntilAPI(TestBase): + NO_DEBUG_INFO_TESTCASE = True + + def setUp(self): + super().setUp() + + self.main_source = "main.c" + self.main_spec = lldb.SBFileSpec(self.main_source) + self.less_than_two = line_number("main.c", "Less than 2") + self.greater_than_two = line_number("main.c", "Greater than or equal to 2.") + self.back_out_in_main = line_number("main.c", "Back out in main") + self.in_foo = line_number("main.c", "In foo") + + def _build_dict_for_discontinuity(self): + return dict( + CFLAGS_EXTRAS="-funique-basic-block-section-names " + + "-ffunction-sections -fbasic-block-sections=list=" + + self.getSourcePath("function.list"), + LD_EXTRAS="-Wl,--script=" + self.getSourcePath("symbol.order"), + ) + + def _do_until(self, build_dict, args, until_line, expected_line): + self.build(dictionary=build_dict) + launch_info = lldb.SBLaunchInfo(args) + _, _, thread, _ = lldbutil.run_to_source_breakpoint( + self, "At the start", self.main_spec, launch_info + ) + + self.assertSuccess( + thread.StepOverUntil(self.frame(), self.main_spec, until_line) + ) + + self.runCmd("process status") + + line = self.frame().GetLineEntry().GetLine() + self.assertEqual( + line, expected_line, "Did not get the expected stop line number" + ) + + def _assertDiscontinuity(self): + target = self.target() + foo = target.FindFunctions("foo") + self.assertEqual(len(foo), 1) + foo = foo[0] + + call_me = self.target().FindFunctions("call_me") + self.assertEqual(len(call_me), 1) + call_me = call_me[0] + + foo_addr = foo.function.GetStartAddress().GetLoadAddress(target) + found_before = False + found_after = False + for range in call_me.function.GetRanges(): + addr = range.GetBaseAddress().GetLoadAddress(target) + if addr < foo_addr: + found_before = True + if addr > foo_addr: + found_after = True + + self.assertTrue( + found_before and found_after, + "'foo' is not between 'call_me'" + str(foo) + str(call_me), + ) + + def test_hitting(self): + """Test SBThread.StepOverUntil - targeting a line and hitting it.""" + self._do_until(None, None, self.less_than_two, self.less_than_two) + + @skipIf(oslist=lldbplatformutil.getDarwinOSTriples() + ["windows"]) + def test_hitting_discontinuous(self): + """Test SBThread.StepOverUntil - targeting a line and hitting it -- with + discontinuous functions""" + self._do_until( + self._build_dict_for_discontinuity(), + None, + self.less_than_two, + self.less_than_two, + ) + self._assertDiscontinuity() + + def test_missing(self): + """Test 
SBThread.StepOverUntil - targeting a line and missing it by stepping out to call site""" + self._do_until( + None, ["foo", "bar", "baz"], self.less_than_two, self.back_out_in_main + ) + + @skipIf(oslist=lldbplatformutil.getDarwinOSTriples() + ["windows"]) + def test_missing_discontinuous(self): + """Test SBThread.StepOverUntil - targeting a line and missing it by + stepping out to call site -- with discontinuous functions""" + self._do_until( + self._build_dict_for_discontinuity(), + ["foo", "bar", "baz"], + self.less_than_two, + self.back_out_in_main, + ) + self._assertDiscontinuity() + + def test_bad_line(self): + """Test that we get an error if attempting to step outside the current + function""" + self.build() + _, _, thread, _ = lldbutil.run_to_source_breakpoint( + self, "At the start", self.main_spec + ) + self.assertIn( + "step until target not in current function", + thread.StepOverUntil( + self.frame(), self.main_spec, self.in_foo + ).GetCString(), + ) + + @skipIf(oslist=lldbplatformutil.getDarwinOSTriples() + ["windows"]) + def test_bad_line_discontinuous(self): + """Test that we get an error if attempting to step outside the current + function -- and the function is discontinuous""" + self.build(dictionary=self._build_dict_for_discontinuity()) + _, _, thread, _ = lldbutil.run_to_source_breakpoint( + self, "At the start", self.main_spec + ) + self.assertIn( + "step until target not in current function", + thread.StepOverUntil( + self.frame(), self.main_spec, self.in_foo + ).GetCString(), + ) + self._assertDiscontinuity() diff --git a/lldb/test/API/functionalities/thread/step_until/function.list b/lldb/test/API/functionalities/thread/step_until/function.list new file mode 100644 index 0000000000000..5900fe8c35069 --- /dev/null +++ b/lldb/test/API/functionalities/thread/step_until/function.list @@ -0,0 +1 @@ +!call_me diff --git a/lldb/test/API/functionalities/thread/step_until/main.c b/lldb/test/API/functionalities/thread/step_until/main.c index bb866079cf5f5..4c52308f030e9 100644 --- a/lldb/test/API/functionalities/thread/step_until/main.c +++ b/lldb/test/API/functionalities/thread/step_until/main.c @@ -4,6 +4,9 @@ * unrelated to the program, just to achieve consistent * debug line tables, across platforms, that are not * dependent on compiler optimzations. */ + +int foo(int x) { return x; /* In foo */ } + int call_me(int argc) { printf ("At the start, argc: %d.\n", argc); diff --git a/lldb/test/API/functionalities/thread/step_until/symbol.order b/lldb/test/API/functionalities/thread/step_until/symbol.order new file mode 100644 index 0000000000000..dcc9607a4188f --- /dev/null +++ b/lldb/test/API/functionalities/thread/step_until/symbol.order @@ -0,0 +1,9 @@ +SECTIONS { + .text.ordered : { + *(.text.call_me) + *(.text.foo) + *(.text.call_me.call_me.__part.1) + *(.text.call_me.call_me.__part.2) + *(.text.call_me.call_me.__part.3) + } +} INSERT BEFORE .text; From a8649067723a84d1b9320523aa63f639f7bf5dfa Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Fri, 17 Jan 2025 11:55:22 +0000 Subject: [PATCH 32/45] [X86] Fix logical operator warnings. NFC. 
--- llvm/lib/Target/X86/X86ISelLowering.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index de5bb08ae3a39..dba38f3e1a0bc 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -26438,7 +26438,7 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, switch (CC) { case ISD::SETEQ: { SetCC = getSETCC(X86::COND_E, Comi, dl, DAG); - if (HasAVX10_2_COMX & HasAVX10_2_COMX_Ty) // ZF == 1 + if (HasAVX10_2_COMX && HasAVX10_2_COMX_Ty) // ZF == 1 break; // (ZF = 1 and PF = 0) SDValue SetNP = getSETCC(X86::COND_NP, Comi, dl, DAG); @@ -26447,7 +26447,7 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, } case ISD::SETNE: { SetCC = getSETCC(X86::COND_NE, Comi, dl, DAG); - if (HasAVX10_2_COMX & HasAVX10_2_COMX_Ty) // ZF == 0 + if (HasAVX10_2_COMX && HasAVX10_2_COMX_Ty) // ZF == 0 break; // (ZF = 0 or PF = 1) SDValue SetP = getSETCC(X86::COND_P, Comi, dl, DAG); From 7075eee6bd0d445aa3f58ace314f7d12756c3e38 Mon Sep 17 00:00:00 2001 From: Timm Baeder Date: Fri, 17 Jan 2025 12:58:15 +0100 Subject: [PATCH 33/45] [clang][bytecode] Add InitLinkScope for toplevel Expr temporary (#123319) --- clang/lib/AST/ByteCode/Compiler.cpp | 1 + clang/test/AST/ByteCode/cxx20.cpp | 15 +++++++++++++++ 2 files changed, 16 insertions(+) diff --git a/clang/lib/AST/ByteCode/Compiler.cpp b/clang/lib/AST/ByteCode/Compiler.cpp index 4bfb80589620c..6677119d09211 100644 --- a/clang/lib/AST/ByteCode/Compiler.cpp +++ b/clang/lib/AST/ByteCode/Compiler.cpp @@ -4247,6 +4247,7 @@ bool Compiler::visitExpr(const Expr *E, bool DestroyToplevelScope) { // For us, that means everything we don't // have a PrimType for. if (std::optional LocalOffset = this->allocateLocal(E)) { + InitLinkScope ILS(this, InitLink::Temp(*LocalOffset)); if (!this->emitGetPtrLocal(*LocalOffset, E)) return false; diff --git a/clang/test/AST/ByteCode/cxx20.cpp b/clang/test/AST/ByteCode/cxx20.cpp index 268362ceff635..268226a7c143e 100644 --- a/clang/test/AST/ByteCode/cxx20.cpp +++ b/clang/test/AST/ByteCode/cxx20.cpp @@ -893,3 +893,18 @@ namespace VirtDtor { static_assert(test('C', 'B')); } + +namespace TemporaryInNTTP { + template struct B { /* ... */ }; + struct J1 { + J1 *self=this; + }; + /// FIXME: The bytecode interpreter emits a different diagnostic here. + /// The current interpreter creates a fake MaterializeTemporaryExpr (see EvaluateAsConstantExpr) + /// which is later used as the LValueBase of the created APValue. + B j1; // ref-error {{pointer to temporary object is not allowed in a template argument}} \ + // expected-error {{non-type template argument is not a constant expression}} \ + // expected-note {{pointer to temporary is not a constant expression}} \ + // expected-note {{created here}} + B<2> j2; /// Ok. +} From 61f94ebc9ef39a47f393a0dca58335e39d961b07 Mon Sep 17 00:00:00 2001 From: Jan Patrick Lehr Date: Fri, 17 Jan 2025 13:01:25 +0100 Subject: [PATCH 34/45] [NFC][Offload] Structure/Readability of CMake cache (#123328) Preparing to add more config options and want to group them all from most-common to project / component specific. 
--- offload/cmake/caches/AMDGPUBot.cmake | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/offload/cmake/caches/AMDGPUBot.cmake b/offload/cmake/caches/AMDGPUBot.cmake index d72b620ae3080..69bef91b2ce49 100644 --- a/offload/cmake/caches/AMDGPUBot.cmake +++ b/offload/cmake/caches/AMDGPUBot.cmake @@ -1,17 +1,19 @@ -# This file is meant for test builds on one basic AMDGPU buildbot only. +# This file is used across all AMDGPU-cmake builders # Install directory set to /tmp as this is a bot config set(CMAKE_INSTALL_PREFIX /tmp/llvm.install.test CACHE STRING "") +# General settings +set(CMAKE_BUILD_TYPE Release CACHE STRING "") +set(CMAKE_C_COMPILER_LAUNCHER ccache CACHE STRING "") +set(CMAKE_CXX_COMPILER_LAUNCHER ccache CACHE STRING "") + set(LLVM_ENABLE_PROJECTS "clang;lld" CACHE STRING "") set(LLVM_ENABLE_RUNTIMES "compiler-rt;openmp;offload" CACHE STRING "") + set(LLVM_ENABLE_PER_TARGET_RUNTIME_DIR ON CACHE BOOL "") set(LLVM_ENABLE_ASSERTIONS ON CACHE BOOL "") -set(LLVM_LIT_ARGS "-v --show-unsupported --timeout 100 --show-xfail -j 32" CACHE STRING "") set(LLVM_TARGETS_TO_BUILD "host;AMDGPU" CACHE STRING "") +set(LLVM_LIT_ARGS "-v --show-unsupported --timeout 100 --show-xfail -j 32" CACHE STRING "") set(CLANG_DEFAULT_LINKER "lld" CACHE STRING "") - -set(CMAKE_BUILD_TYPE Release CACHE STRING "") -set(CMAKE_C_COMPILER_LAUNCHER ccache CACHE STRING "") -set(CMAKE_CXX_COMPILER_LAUNCHER ccache CACHE STRING "") From 58fc8029e91bf56811444d4a37a8f517a43bdc11 Mon Sep 17 00:00:00 2001 From: Pavel Labath Date: Fri, 17 Jan 2025 12:45:14 +0100 Subject: [PATCH 35/45] [lldb] Skip TestStepUntilAPI on !x86_64, !aarch64 The compiler does not support this feature on other architectures. --- .../API/functionalities/thread/step_until/TestStepUntilAPI.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/lldb/test/API/functionalities/thread/step_until/TestStepUntilAPI.py b/lldb/test/API/functionalities/thread/step_until/TestStepUntilAPI.py index de3892ed278f8..59e028acf014c 100644 --- a/lldb/test/API/functionalities/thread/step_until/TestStepUntilAPI.py +++ b/lldb/test/API/functionalities/thread/step_until/TestStepUntilAPI.py @@ -73,6 +73,7 @@ def test_hitting(self): self._do_until(None, None, self.less_than_two, self.less_than_two) @skipIf(oslist=lldbplatformutil.getDarwinOSTriples() + ["windows"]) + @skipIf(archs=no_match(["x86_64", "aarch64"])) def test_hitting_discontinuous(self): """Test SBThread.StepOverUntil - targeting a line and hitting it -- with discontinuous functions""" @@ -91,6 +92,7 @@ def test_missing(self): ) @skipIf(oslist=lldbplatformutil.getDarwinOSTriples() + ["windows"]) + @skipIf(archs=no_match(["x86_64", "aarch64"])) def test_missing_discontinuous(self): """Test SBThread.StepOverUntil - targeting a line and missing it by stepping out to call site -- with discontinuous functions""" @@ -117,6 +119,7 @@ def test_bad_line(self): ) @skipIf(oslist=lldbplatformutil.getDarwinOSTriples() + ["windows"]) + @skipIf(archs=no_match(["x86_64", "aarch64"])) def test_bad_line_discontinuous(self): """Test that we get an error if attempting to step outside the current function -- and the function is discontinuous""" From a90b5b1885cc9587d7d65edbe3e0d94c4e2f4459 Mon Sep 17 00:00:00 2001 From: Fraser Cormack Date: Fri, 17 Jan 2025 12:11:53 +0000 Subject: [PATCH 36/45] [libclc] Move degrees/radians to CLC library & optimize (#123222) Missing half variants were also added. 
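
For illustration, "vector form" here means each overload multiplies by the scalar
constant splatted to the operand's width. A minimal sketch of what the new
DEGREES_SINGLE_DEF macro in this patch expands to for the 4-wide float overload
(illustrative only; _CLC_OVERLOAD and _CLC_DEF are the existing clc attribute macros):

    // DEGREES_SINGLE_DEF(float4, 0x1.ca5dc2p+5F) expands to:
    _CLC_OVERLOAD _CLC_DEF float4 __clc_degrees(float4 radians) {
      // (float4)0x1.ca5dc2p+5F splats the scalar 180/pi constant to all lanes.
      return (float4)0x1.ca5dc2p+5F * radians;
    }

The radians overloads expand the same way with the pi/180 constant.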
The builtins are now consistently emitted in vector form (i.e., with a splat of the literal to the appropriate vector size). --- libclc/clc/include/clc/common/clc_degrees.h | 12 +++++ libclc/clc/include/clc/common/clc_radians.h | 12 +++++ libclc/clc/lib/generic/SOURCES | 2 + libclc/clc/lib/generic/common/clc_degrees.cl | 56 ++++++++++++++++++++ libclc/clc/lib/generic/common/clc_radians.cl | 56 ++++++++++++++++++++ libclc/clc/lib/spirv/SOURCES | 2 + libclc/clc/lib/spirv64/SOURCES | 2 + libclc/generic/lib/common/degrees.cl | 21 ++++---- libclc/generic/lib/common/radians.cl | 21 ++++---- 9 files changed, 160 insertions(+), 24 deletions(-) create mode 100644 libclc/clc/include/clc/common/clc_degrees.h create mode 100644 libclc/clc/include/clc/common/clc_radians.h create mode 100644 libclc/clc/lib/generic/common/clc_degrees.cl create mode 100644 libclc/clc/lib/generic/common/clc_radians.cl diff --git a/libclc/clc/include/clc/common/clc_degrees.h b/libclc/clc/include/clc/common/clc_degrees.h new file mode 100644 index 0000000000000..e8bb684fcd4d7 --- /dev/null +++ b/libclc/clc/include/clc/common/clc_degrees.h @@ -0,0 +1,12 @@ +#ifndef __CLC_MATH_CLC_DEGREES_H__ +#define __CLC_MATH_CLC_DEGREES_H__ + +#define __CLC_BODY +#define __CLC_FUNCTION __clc_degrees + +#include + +#undef __CLC_BODY +#undef __CLC_FUNCTION + +#endif // __CLC_MATH_CLC_DEGREES_H__ diff --git a/libclc/clc/include/clc/common/clc_radians.h b/libclc/clc/include/clc/common/clc_radians.h new file mode 100644 index 0000000000000..80d481e8de723 --- /dev/null +++ b/libclc/clc/include/clc/common/clc_radians.h @@ -0,0 +1,12 @@ +#ifndef __CLC_MATH_CLC_RADIANS_H__ +#define __CLC_MATH_CLC_RADIANS_H__ + +#define __CLC_BODY +#define __CLC_FUNCTION __clc_radians + +#include + +#undef __CLC_BODY +#undef __CLC_FUNCTION + +#endif // __CLC_MATH_CLC_RADIANS_H__ diff --git a/libclc/clc/lib/generic/SOURCES b/libclc/clc/lib/generic/SOURCES index f3097de694422..d74bff20ba87b 100644 --- a/libclc/clc/lib/generic/SOURCES +++ b/libclc/clc/lib/generic/SOURCES @@ -1,3 +1,5 @@ +common/clc_degrees.cl +common/clc_radians.cl common/clc_smoothstep.cl geometric/clc_dot.cl integer/clc_abs.cl diff --git a/libclc/clc/lib/generic/common/clc_degrees.cl b/libclc/clc/lib/generic/common/clc_degrees.cl new file mode 100644 index 0000000000000..ce705982072e8 --- /dev/null +++ b/libclc/clc/lib/generic/common/clc_degrees.cl @@ -0,0 +1,56 @@ +/* + * Copyright (c) 2014,2015 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ + +#include +#include + +#define DEGREES_SINGLE_DEF(TYPE, LITERAL) \ + _CLC_OVERLOAD _CLC_DEF TYPE __clc_degrees(TYPE radians) { \ + return (TYPE)LITERAL * radians; \ + } + +#define DEGREES_DEF(TYPE, LITERAL) \ + DEGREES_SINGLE_DEF(TYPE, LITERAL) \ + DEGREES_SINGLE_DEF(TYPE##2, LITERAL) \ + DEGREES_SINGLE_DEF(TYPE##3, LITERAL) \ + DEGREES_SINGLE_DEF(TYPE##4, LITERAL) \ + DEGREES_SINGLE_DEF(TYPE##8, LITERAL) \ + DEGREES_SINGLE_DEF(TYPE##16, LITERAL) + +// 180/pi = ~57.29577951308232087685 or 0x1.ca5dc1a63c1f8p+5 or 0x1.ca5dc2p+5F +DEGREES_DEF(float, 0x1.ca5dc2p+5F) + +#ifdef cl_khr_fp64 +#pragma OPENCL EXTENSION cl_khr_fp64 : enable + +// 180/pi = ~57.29577951308232087685 or 0x1.ca5dc1a63c1f8p+5 or 0x1.ca5dc2p+5F +DEGREES_DEF(double, 0x1.ca5dc1a63c1f8p+5) + +#endif + +#ifdef cl_khr_fp16 +#pragma OPENCL EXTENSION cl_khr_fp16 : enable + +// 180/pi = ~57.29577951308232087685 or 0x1.ca5dc1a63c1f8p+5 or 0x1.ca5dc2p+5F +DEGREES_DEF(half, (half)0x1.ca5dc1a63c1f8p+5) + +#endif diff --git a/libclc/clc/lib/generic/common/clc_radians.cl b/libclc/clc/lib/generic/common/clc_radians.cl new file mode 100644 index 0000000000000..850b8eb84f9da --- /dev/null +++ b/libclc/clc/lib/generic/common/clc_radians.cl @@ -0,0 +1,56 @@ +/* + * Copyright (c) 2014,2015 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ + +#include +#include + +#define RADIANS_SINGLE_DEF(TYPE, LITERAL) \ + _CLC_OVERLOAD _CLC_DEF TYPE __clc_radians(TYPE radians) { \ + return (TYPE)LITERAL * radians; \ + } + +#define RADIANS_DEF(TYPE, LITERAL) \ + RADIANS_SINGLE_DEF(TYPE, LITERAL) \ + RADIANS_SINGLE_DEF(TYPE##2, LITERAL) \ + RADIANS_SINGLE_DEF(TYPE##3, LITERAL) \ + RADIANS_SINGLE_DEF(TYPE##4, LITERAL) \ + RADIANS_SINGLE_DEF(TYPE##8, LITERAL) \ + RADIANS_SINGLE_DEF(TYPE##16, LITERAL) + +// pi/180 = ~0.01745329251994329577 or 0x1.1df46a2529d39p-6 or 0x1.1df46ap-6F +RADIANS_DEF(float, 0x1.1df46ap-6F) + +#ifdef cl_khr_fp64 +#pragma OPENCL EXTENSION cl_khr_fp64 : enable + +// pi/180 = ~0.01745329251994329577 or 0x1.1df46a2529d39p-6 or 0x1.1df46ap-6F +RADIANS_DEF(double, 0x1.1df46a2529d39p-6) + +#endif + +#ifdef cl_khr_fp16 +#pragma OPENCL EXTENSION cl_khr_fp16 : enable + +// pi/180 = ~0.01745329251994329577 or 0x1.1df46a2529d39p-6 or 0x1.1df46ap-6F +RADIANS_DEF(half, (half)0x1.1df46a2529d39p-6) + +#endif diff --git a/libclc/clc/lib/spirv/SOURCES b/libclc/clc/lib/spirv/SOURCES index 02784b8def682..ac855ea5184ed 100644 --- a/libclc/clc/lib/spirv/SOURCES +++ b/libclc/clc/lib/spirv/SOURCES @@ -1,3 +1,5 @@ +../generic/common/clc_degrees.cl +../generic/common/clc_radians.cl ../generic/common/clc_smoothstep.cl ../generic/geometric/clc_dot.cl ../generic/math/clc_ceil.cl diff --git a/libclc/clc/lib/spirv64/SOURCES b/libclc/clc/lib/spirv64/SOURCES index 02784b8def682..ac855ea5184ed 100644 --- a/libclc/clc/lib/spirv64/SOURCES +++ b/libclc/clc/lib/spirv64/SOURCES @@ -1,3 +1,5 @@ +../generic/common/clc_degrees.cl +../generic/common/clc_radians.cl ../generic/common/clc_smoothstep.cl ../generic/geometric/clc_dot.cl ../generic/math/clc_ceil.cl diff --git a/libclc/generic/lib/common/degrees.cl b/libclc/generic/lib/common/degrees.cl index cf49b190c76b3..a9715d64f507a 100644 --- a/libclc/generic/lib/common/degrees.cl +++ b/libclc/generic/lib/common/degrees.cl @@ -22,23 +22,20 @@ #include #include +#include -_CLC_OVERLOAD _CLC_DEF float degrees(float radians) { - // 180/pi = ~57.29577951308232087685 or 0x1.ca5dc1a63c1f8p+5 or 0x1.ca5dc2p+5F - return 0x1.ca5dc2p+5F * radians; -} - -_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, float, degrees, float); - +_CLC_DEFINE_UNARY_BUILTIN(float, degrees, __clc_degrees, float) #ifdef cl_khr_fp64 #pragma OPENCL EXTENSION cl_khr_fp64 : enable -_CLC_OVERLOAD _CLC_DEF double degrees(double radians) { - // 180/pi = ~57.29577951308232087685 or 0x1.ca5dc1a63c1f8p+5 or 0x1.ca5dc2p+5F - return 0x1.ca5dc1a63c1f8p+5 * radians; -} +_CLC_DEFINE_UNARY_BUILTIN(double, degrees, __clc_degrees, double) + +#endif + +#ifdef cl_khr_fp16 +#pragma OPENCL EXTENSION cl_khr_fp16 : enable -_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, degrees, double); +_CLC_DEFINE_UNARY_BUILTIN(half, degrees, __clc_degrees, half) #endif diff --git a/libclc/generic/lib/common/radians.cl b/libclc/generic/lib/common/radians.cl index 645a30549afed..b5dcbfe6e3fd2 100644 --- a/libclc/generic/lib/common/radians.cl +++ b/libclc/generic/lib/common/radians.cl @@ -22,23 +22,20 @@ #include #include +#include -_CLC_OVERLOAD _CLC_DEF float radians(float degrees) { - // pi/180 = ~0.01745329251994329577 or 0x1.1df46a2529d39p-6 or 0x1.1df46ap-6F - return 0x1.1df46ap-6F * degrees; -} - -_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, float, radians, float); - +_CLC_DEFINE_UNARY_BUILTIN(float, radians, __clc_radians, float) #ifdef cl_khr_fp64 #pragma OPENCL EXTENSION cl_khr_fp64 : enable -_CLC_OVERLOAD _CLC_DEF double radians(double degrees) { - // pi/180 = 
~0.01745329251994329577 or 0x1.1df46a2529d39p-6 or 0x1.1df46ap-6F - return 0x1.1df46a2529d39p-6 * degrees; -} +_CLC_DEFINE_UNARY_BUILTIN(double, radians, __clc_radians, double) + +#endif + +#ifdef cl_khr_fp16 +#pragma OPENCL EXTENSION cl_khr_fp16 : enable -_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, radians, double); +_CLC_DEFINE_UNARY_BUILTIN(half, radians, __clc_radians, half) #endif From 8c63648117f1e1705943903b149f36ab8a4df1e5 Mon Sep 17 00:00:00 2001 From: Kiran Chandramohan Date: Fri, 17 Jan 2025 12:14:20 +0000 Subject: [PATCH 37/45] =?UTF-8?q?Revert=20"Revert=20"[Flang][Driver]=20Add?= =?UTF-8?q?=20a=20flag=20to=20control=20zero=20initializa=E2=80=A6=20(#123?= =?UTF-8?q?097)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit …tion of global v…" (#123067)" This reverts commit 44ba43aa2b740878d83a9d6f1d52a333c0d48c22. Adds the flag to bbc as well. --- clang/include/clang/Driver/Options.td | 5 +++++ clang/lib/Driver/ToolChains/Flang.cpp | 6 +++-- flang/include/flang/Lower/LoweringOptions.def | 3 +++ flang/lib/Frontend/CompilerInvocation.cpp | 8 +++++++ flang/lib/Lower/ConvertVariable.cpp | 6 ++++- flang/test/Driver/fno-zero-init.f90 | 9 ++++++++ flang/test/Lower/zero_init.f90 | 20 +++++++++++++++++ flang/test/Lower/zero_init_default_init.f90 | 22 +++++++++++++++++++ flang/tools/bbc/bbc.cpp | 6 +++++ 9 files changed, 82 insertions(+), 3 deletions(-) create mode 100644 flang/test/Driver/fno-zero-init.f90 create mode 100644 flang/test/Lower/zero_init.f90 create mode 100644 flang/test/Lower/zero_init_default_init.f90 diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index d38dd2b4e3cf0..c4b9743597bb2 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -3505,6 +3505,11 @@ def fno_struct_path_tbaa : Flag<["-"], "fno-struct-path-tbaa">, Group; def fno_strict_enums : Flag<["-"], "fno-strict-enums">, Group; def fno_strict_overflow : Flag<["-"], "fno-strict-overflow">, Group, Visibility<[ClangOption, FlangOption]>; +defm init_global_zero : BoolOptionWithoutMarshalling<"f", "init-global-zero", + PosFlag, + NegFlag>; def fno_pointer_tbaa : Flag<["-"], "fno-pointer-tbaa">, Group; def fno_temp_file : Flag<["-"], "fno-temp-file">, Group, Visibility<[ClangOption, CC1Option, CLOption, DXCOption]>, HelpText< diff --git a/clang/lib/Driver/ToolChains/Flang.cpp b/clang/lib/Driver/ToolChains/Flang.cpp index 86ed25badfa2b..9c1fd28a3a8a2 100644 --- a/clang/lib/Driver/ToolChains/Flang.cpp +++ b/clang/lib/Driver/ToolChains/Flang.cpp @@ -155,8 +155,10 @@ void Flang::addCodegenOptions(const ArgList &Args, options::OPT_flang_deprecated_no_hlfir, options::OPT_fno_ppc_native_vec_elem_order, options::OPT_fppc_native_vec_elem_order, - options::OPT_ftime_report, options::OPT_ftime_report_EQ, - options::OPT_funroll_loops, options::OPT_fno_unroll_loops}); + options::OPT_finit_global_zero, + options::OPT_fno_init_global_zero, options::OPT_ftime_report, + options::OPT_ftime_report_EQ, options::OPT_funroll_loops, + options::OPT_fno_unroll_loops}); } void Flang::addPicOptions(const ArgList &Args, ArgStringList &CmdArgs) const { diff --git a/flang/include/flang/Lower/LoweringOptions.def b/flang/include/flang/Lower/LoweringOptions.def index 5a6debfdffe03..396c91948be36 100644 --- a/flang/include/flang/Lower/LoweringOptions.def +++ b/flang/include/flang/Lower/LoweringOptions.def @@ -44,5 +44,8 @@ ENUM_LOWERINGOPT(IntegerWrapAround, unsigned, 1, 0) /// If false, assume that the 
shapes/types/allocation-status match. ENUM_LOWERINGOPT(ReallocateLHS, unsigned, 1, 1) +/// If true, initialize globals without initialization to zero. +/// On by default. +ENUM_LOWERINGOPT(InitGlobalZero, unsigned, 1, 1) #undef LOWERINGOPT #undef ENUM_LOWERINGOPT diff --git a/flang/lib/Frontend/CompilerInvocation.cpp b/flang/lib/Frontend/CompilerInvocation.cpp index 15b1e1e0a2488..3c6da4687f65d 100644 --- a/flang/lib/Frontend/CompilerInvocation.cpp +++ b/flang/lib/Frontend/CompilerInvocation.cpp @@ -1377,6 +1377,14 @@ bool CompilerInvocation::createFromArgs( invoc.loweringOpts.setNoPPCNativeVecElemOrder(true); } + // -f[no-]init-global-zero + if (args.hasFlag(clang::driver::options::OPT_finit_global_zero, + clang::driver::options::OPT_fno_init_global_zero, + /*default=*/true)) + invoc.loweringOpts.setInitGlobalZero(true); + else + invoc.loweringOpts.setInitGlobalZero(false); + // Preserve all the remark options requested, i.e. -Rpass, -Rpass-missed or // -Rpass-analysis. This will be used later when processing and outputting the // remarks generated by LLVM in ExecuteCompilerInvocation.cpp. diff --git a/flang/lib/Lower/ConvertVariable.cpp b/flang/lib/Lower/ConvertVariable.cpp index 9ee42d5cd8800..87236dc293ebb 100644 --- a/flang/lib/Lower/ConvertVariable.cpp +++ b/flang/lib/Lower/ConvertVariable.cpp @@ -635,7 +635,11 @@ static fir::GlobalOp defineGlobal(Fortran::lower::AbstractConverter &converter, global.setLinkName(builder.createCommonLinkage()); Fortran::lower::createGlobalInitialization( builder, global, [&](fir::FirOpBuilder &builder) { - mlir::Value initValue = builder.create(loc, symTy); + mlir::Value initValue; + if (converter.getLoweringOptions().getInitGlobalZero()) + initValue = builder.create(loc, symTy); + else + initValue = builder.create(loc, symTy); builder.create(loc, initValue); }); } diff --git a/flang/test/Driver/fno-zero-init.f90 b/flang/test/Driver/fno-zero-init.f90 new file mode 100644 index 0000000000000..2ffa10dd040d5 --- /dev/null +++ b/flang/test/Driver/fno-zero-init.f90 @@ -0,0 +1,9 @@ +! Check that the driver passes through -f[no-]init-global-zero: +! RUN: %flang -### -S -finit-global-zero %s -o - 2>&1 | FileCheck --check-prefix=CHECK-POS %s +! RUN: %flang -### -S -fno-init-global-zero %s -o - 2>&1 | FileCheck --check-prefix=CHECK-NEG %s +! Check that the compiler accepts -f[no-]init-global-zero: +! RUN: %flang_fc1 -emit-hlfir -finit-global-zero %s -o - +! RUN: %flang_fc1 -emit-hlfir -fno-init-global-zero %s -o - + +! CHECK-POS: "-fc1"{{.*}}"-finit-global-zero" +! CHECK-NEG: "-fc1"{{.*}}"-fno-init-global-zero" diff --git a/flang/test/Lower/zero_init.f90 b/flang/test/Lower/zero_init.f90 new file mode 100644 index 0000000000000..5ed6f2247de3b --- /dev/null +++ b/flang/test/Lower/zero_init.f90 @@ -0,0 +1,20 @@ +! RUN: %flang_fc1 -emit-hlfir -o - %s | FileCheck --check-prefix=CHECK-DEFAULT %s +! RUN: %flang_fc1 -finit-global-zero -emit-hlfir -o - %s | FileCheck --check-prefix=CHECK-DEFAULT %s +! RUN: %flang_fc1 -fno-init-global-zero -emit-hlfir -o - %s | FileCheck --check-prefix=CHECK-NO-ZERO-INIT %s +! RUN: bbc -emit-hlfir -o - %s | FileCheck --check-prefix=CHECK-DEFAULT %s +! RUN: bbc -finit-global-zero -emit-hlfir -o - %s | FileCheck --check-prefix=CHECK-DEFAULT %s +! 
RUN: bbc -finit-global-zero=false -emit-hlfir -o - %s | FileCheck --check-prefix=CHECK-NO-ZERO-INIT %s + +module m1 + real :: x +end module m1 + +!CHECK-DEFAULT: fir.global @_QMm1Ex : f32 { +!CHECK-DEFAULT: %[[UNDEF:.*]] = fir.zero_bits f32 +!CHECK-DEFAULT: fir.has_value %[[UNDEF]] : f32 +!CHECK-DEFAULT: } + +!CHECK-NO-ZERO-INIT: fir.global @_QMm1Ex : f32 { +!CHECK-NO-ZERO-INIT: %[[UNDEF:.*]] = fir.undefined f32 +!CHECK-NO-ZERO-INIT: fir.has_value %[[UNDEF]] : f32 +!CHECK-NO-ZERO-INIT: } diff --git a/flang/test/Lower/zero_init_default_init.f90 b/flang/test/Lower/zero_init_default_init.f90 new file mode 100644 index 0000000000000..e2d1f545e35a5 --- /dev/null +++ b/flang/test/Lower/zero_init_default_init.f90 @@ -0,0 +1,22 @@ +! RUN: %flang_fc1 -emit-hlfir -o - %s | FileCheck %s +! RUN: %flang_fc1 -finit-global-zero -emit-hlfir -o - %s | FileCheck %s +! RUN: %flang_fc1 -fno-init-global-zero -emit-hlfir -o - %s | FileCheck %s +! RUN: bbc -emit-hlfir -o - %s | FileCheck %s +! RUN: bbc -finit-global-zero -emit-hlfir -o - %s | FileCheck %s +! RUN: bbc -finit-global-zero=false -emit-hlfir -o - %s | FileCheck %s + +! Test that the flag does not affect globals with default init + +module m2 + type val + integer :: my_val = 1 + end type val + type(val) :: v1 +end module m2 + +!CHECK: fir.global @_QMm2Ev1 : !fir.type<_QMm2Tval{my_val:i32}> { +!CHECK: %[[V1:.*]] = fir.undefined !fir.type<_QMm2Tval{my_val:i32}> +!CHECK: %[[ONE:.*]] = arith.constant 1 : i32 +!CHECK: %[[V1_INIT:.*]] = fir.insert_value %[[V1]], %[[ONE]], ["my_val", !fir.type<_QMm2Tval{my_val:i32}>] : (!fir.type<_QMm2Tval{my_val:i32}>, i32) -> !fir.type<_QMm2Tval{my_val:i32}> +!CHECK: fir.has_value %[[V1_INIT]] : !fir.type<_QMm2Tval{my_val:i32}> +!CHECK: } diff --git a/flang/tools/bbc/bbc.cpp b/flang/tools/bbc/bbc.cpp index 7efc460be8679..dafbcd856389a 100644 --- a/flang/tools/bbc/bbc.cpp +++ b/flang/tools/bbc/bbc.cpp @@ -234,6 +234,11 @@ static llvm::cl::opt integerWrapAround( llvm::cl::desc("Treat signed integer overflow as two's complement"), llvm::cl::init(false)); +static llvm::cl::opt initGlobalZero( + "finit-global-zero", + llvm::cl::desc("Zero initialize globals without default initialization"), + llvm::cl::init(true)); + static llvm::cl::opt reallocateLHS("frealloc-lhs", llvm::cl::desc("Follow Fortran 2003 rules for (re)allocating " @@ -381,6 +386,7 @@ static llvm::LogicalResult convertFortranSourceToMLIR( loweringOptions.setNoPPCNativeVecElemOrder(enableNoPPCNativeVecElemOrder); loweringOptions.setLowerToHighLevelFIR(useHLFIR || emitHLFIR); loweringOptions.setIntegerWrapAround(integerWrapAround); + loweringOptions.setInitGlobalZero(initGlobalZero); loweringOptions.setReallocateLHS(reallocateLHS); std::vector envDefaults = {}; Fortran::frontend::TargetOptions targetOpts; From bacfdcd7e0989117a3c76b040fe9efe093fa8708 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Fri, 17 Jan 2025 12:22:07 +0000 Subject: [PATCH 38/45] [DAG] Add SDPatternMatch::m_BitCast matcher (#123327) Simplifies a future patch --- llvm/include/llvm/CodeGen/SDPatternMatch.h | 4 ++++ llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 2 +- llvm/unittests/CodeGen/SelectionDAGPatternMatchTest.cpp | 5 +++++ 3 files changed, 10 insertions(+), 1 deletion(-) diff --git a/llvm/include/llvm/CodeGen/SDPatternMatch.h b/llvm/include/llvm/CodeGen/SDPatternMatch.h index 4faa090901a6a..4488a6152117c 100644 --- a/llvm/include/llvm/CodeGen/SDPatternMatch.h +++ b/llvm/include/llvm/CodeGen/SDPatternMatch.h @@ -896,6 +896,10 @@ inline UnaryOpc_match m_ChainedUnaryOp(unsigned Opc, 
return UnaryOpc_match(Opc, Op); } +template inline UnaryOpc_match m_BitCast(const Opnd &Op) { + return UnaryOpc_match(ISD::BITCAST, Op); +} + template inline UnaryOpc_match m_BSwap(const Opnd &Op) { return UnaryOpc_match(ISD::BSWAP, Op); diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index de7fb21f5903e..49e5b7d9ef014 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -15770,7 +15770,7 @@ SDValue DAGCombiner::foldBitcastedFPLogic(SDNode *N, SelectionDAG &DAG, // FIXME: I don't think looking for bitcast intrinsically makes sense, but // removing this would require more changes. auto IsBitCastOrFree = [&TLI, FPOpcode](SDValue Op, EVT VT) { - if (Op.getOpcode() == ISD::BITCAST && Op.getOperand(0).getValueType() == VT) + if (sd_match(Op, m_BitCast(m_SpecificVT(VT)))) return true; return FPOpcode == ISD::FABS ? TLI.isFAbsFree(VT) : TLI.isFNegFree(VT); diff --git a/llvm/unittests/CodeGen/SelectionDAGPatternMatchTest.cpp b/llvm/unittests/CodeGen/SelectionDAGPatternMatchTest.cpp index bf9c597d8ac5e..736a36da97f57 100644 --- a/llvm/unittests/CodeGen/SelectionDAGPatternMatchTest.cpp +++ b/llvm/unittests/CodeGen/SelectionDAGPatternMatchTest.cpp @@ -392,6 +392,7 @@ TEST_F(SelectionDAGPatternMatchTest, matchUnaryOp) { SDValue FPToSI = DAG->getNode(ISD::FP_TO_SINT, DL, FloatVT, Op2); SDValue FPToUI = DAG->getNode(ISD::FP_TO_UINT, DL, FloatVT, Op2); + SDValue Bcast = DAG->getNode(ISD::BITCAST, DL, FloatVT, Op0); SDValue Brev = DAG->getNode(ISD::BITREVERSE, DL, Int32VT, Op0); SDValue Bswap = DAG->getNode(ISD::BSWAP, DL, Int32VT, Op0); @@ -423,8 +424,12 @@ TEST_F(SelectionDAGPatternMatchTest, matchUnaryOp) { EXPECT_FALSE(sd_match(FPToUI, m_FPToSI(m_Value()))); EXPECT_FALSE(sd_match(FPToSI, m_FPToUI(m_Value()))); + EXPECT_TRUE(sd_match(Bcast, m_BitCast(m_Value()))); + EXPECT_TRUE(sd_match(Bcast, m_BitCast(m_SpecificVT(MVT::i32)))); EXPECT_TRUE(sd_match(Brev, m_BitReverse(m_Value()))); EXPECT_TRUE(sd_match(Bswap, m_BSwap(m_Value()))); + EXPECT_FALSE(sd_match(Bcast, m_BitReverse(m_Value()))); + EXPECT_FALSE(sd_match(Bcast, m_BitCast(m_SpecificVT(MVT::f32)))); EXPECT_FALSE(sd_match(Brev, m_BSwap(m_Value()))); EXPECT_FALSE(sd_match(Bswap, m_BitReverse(m_Value()))); From ce3280a64467b5211ced77169f3203c07934e06b Mon Sep 17 00:00:00 2001 From: Alexandros Lamprineas Date: Fri, 17 Jan 2025 12:25:37 +0000 Subject: [PATCH 39/45] Fix for buildbot errors on non-aarch64 targets. 
(#123322) Add missing REQUIRES: aarch64-registered-target --- llvm/test/Transforms/GlobalOpt/resolve-fmv-ifunc.ll | 3 +++ 1 file changed, 3 insertions(+) diff --git a/llvm/test/Transforms/GlobalOpt/resolve-fmv-ifunc.ll b/llvm/test/Transforms/GlobalOpt/resolve-fmv-ifunc.ll index 90bd98a9b0d38..4b6a19d3f05cf 100644 --- a/llvm/test/Transforms/GlobalOpt/resolve-fmv-ifunc.ll +++ b/llvm/test/Transforms/GlobalOpt/resolve-fmv-ifunc.ll @@ -1,4 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --filter "call i32 @(test_single_bb_resolver|test_multi_bb_resolver|test_caller_feats_not_implied|test_non_fmv_caller|test_priority|test_alternative_names)" --version 4 + +; REQUIRES: aarch64-registered-target + ; RUN: opt --passes=globalopt -o - -S < %s | FileCheck %s target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" From 8a229f595a5c0ff354cdfa05cda974a9d56674df Mon Sep 17 00:00:00 2001 From: Kiran Chandramohan Date: Fri, 17 Jan 2025 12:27:58 +0000 Subject: [PATCH 40/45] =?UTF-8?q?Revert=20"Revert=20"Revert=20"[Flang][Dri?= =?UTF-8?q?ver]=20Add=20a=20flag=20to=20control=20zero=20initializa?= =?UTF-8?q?=E2=80=A6"=20(#123330)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reverts llvm/llvm-project#123097 Reverting due to buildbot failure https://lab.llvm.org/buildbot/#/builders/89/builds/14577. --- clang/include/clang/Driver/Options.td | 5 ----- clang/lib/Driver/ToolChains/Flang.cpp | 6 ++--- flang/include/flang/Lower/LoweringOptions.def | 3 --- flang/lib/Frontend/CompilerInvocation.cpp | 8 ------- flang/lib/Lower/ConvertVariable.cpp | 6 +---- flang/test/Driver/fno-zero-init.f90 | 9 -------- flang/test/Lower/zero_init.f90 | 20 ----------------- flang/test/Lower/zero_init_default_init.f90 | 22 ------------------- flang/tools/bbc/bbc.cpp | 6 ----- 9 files changed, 3 insertions(+), 82 deletions(-) delete mode 100644 flang/test/Driver/fno-zero-init.f90 delete mode 100644 flang/test/Lower/zero_init.f90 delete mode 100644 flang/test/Lower/zero_init_default_init.f90 diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index c4b9743597bb2..d38dd2b4e3cf0 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -3505,11 +3505,6 @@ def fno_struct_path_tbaa : Flag<["-"], "fno-struct-path-tbaa">, Group; def fno_strict_enums : Flag<["-"], "fno-strict-enums">, Group; def fno_strict_overflow : Flag<["-"], "fno-strict-overflow">, Group, Visibility<[ClangOption, FlangOption]>; -defm init_global_zero : BoolOptionWithoutMarshalling<"f", "init-global-zero", - PosFlag, - NegFlag>; def fno_pointer_tbaa : Flag<["-"], "fno-pointer-tbaa">, Group; def fno_temp_file : Flag<["-"], "fno-temp-file">, Group, Visibility<[ClangOption, CC1Option, CLOption, DXCOption]>, HelpText< diff --git a/clang/lib/Driver/ToolChains/Flang.cpp b/clang/lib/Driver/ToolChains/Flang.cpp index 9c1fd28a3a8a2..86ed25badfa2b 100644 --- a/clang/lib/Driver/ToolChains/Flang.cpp +++ b/clang/lib/Driver/ToolChains/Flang.cpp @@ -155,10 +155,8 @@ void Flang::addCodegenOptions(const ArgList &Args, options::OPT_flang_deprecated_no_hlfir, options::OPT_fno_ppc_native_vec_elem_order, options::OPT_fppc_native_vec_elem_order, - options::OPT_finit_global_zero, - options::OPT_fno_init_global_zero, options::OPT_ftime_report, - options::OPT_ftime_report_EQ, options::OPT_funroll_loops, - options::OPT_fno_unroll_loops}); + options::OPT_ftime_report, options::OPT_ftime_report_EQ, + 
options::OPT_funroll_loops, options::OPT_fno_unroll_loops}); } void Flang::addPicOptions(const ArgList &Args, ArgStringList &CmdArgs) const { diff --git a/flang/include/flang/Lower/LoweringOptions.def b/flang/include/flang/Lower/LoweringOptions.def index 396c91948be36..5a6debfdffe03 100644 --- a/flang/include/flang/Lower/LoweringOptions.def +++ b/flang/include/flang/Lower/LoweringOptions.def @@ -44,8 +44,5 @@ ENUM_LOWERINGOPT(IntegerWrapAround, unsigned, 1, 0) /// If false, assume that the shapes/types/allocation-status match. ENUM_LOWERINGOPT(ReallocateLHS, unsigned, 1, 1) -/// If true, initialize globals without initialization to zero. -/// On by default. -ENUM_LOWERINGOPT(InitGlobalZero, unsigned, 1, 1) #undef LOWERINGOPT #undef ENUM_LOWERINGOPT diff --git a/flang/lib/Frontend/CompilerInvocation.cpp b/flang/lib/Frontend/CompilerInvocation.cpp index 3c6da4687f65d..15b1e1e0a2488 100644 --- a/flang/lib/Frontend/CompilerInvocation.cpp +++ b/flang/lib/Frontend/CompilerInvocation.cpp @@ -1377,14 +1377,6 @@ bool CompilerInvocation::createFromArgs( invoc.loweringOpts.setNoPPCNativeVecElemOrder(true); } - // -f[no-]init-global-zero - if (args.hasFlag(clang::driver::options::OPT_finit_global_zero, - clang::driver::options::OPT_fno_init_global_zero, - /*default=*/true)) - invoc.loweringOpts.setInitGlobalZero(true); - else - invoc.loweringOpts.setInitGlobalZero(false); - // Preserve all the remark options requested, i.e. -Rpass, -Rpass-missed or // -Rpass-analysis. This will be used later when processing and outputting the // remarks generated by LLVM in ExecuteCompilerInvocation.cpp. diff --git a/flang/lib/Lower/ConvertVariable.cpp b/flang/lib/Lower/ConvertVariable.cpp index 87236dc293ebb..9ee42d5cd8800 100644 --- a/flang/lib/Lower/ConvertVariable.cpp +++ b/flang/lib/Lower/ConvertVariable.cpp @@ -635,11 +635,7 @@ static fir::GlobalOp defineGlobal(Fortran::lower::AbstractConverter &converter, global.setLinkName(builder.createCommonLinkage()); Fortran::lower::createGlobalInitialization( builder, global, [&](fir::FirOpBuilder &builder) { - mlir::Value initValue; - if (converter.getLoweringOptions().getInitGlobalZero()) - initValue = builder.create(loc, symTy); - else - initValue = builder.create(loc, symTy); + mlir::Value initValue = builder.create(loc, symTy); builder.create(loc, initValue); }); } diff --git a/flang/test/Driver/fno-zero-init.f90 b/flang/test/Driver/fno-zero-init.f90 deleted file mode 100644 index 2ffa10dd040d5..0000000000000 --- a/flang/test/Driver/fno-zero-init.f90 +++ /dev/null @@ -1,9 +0,0 @@ -! Check that the driver passes through -f[no-]init-global-zero: -! RUN: %flang -### -S -finit-global-zero %s -o - 2>&1 | FileCheck --check-prefix=CHECK-POS %s -! RUN: %flang -### -S -fno-init-global-zero %s -o - 2>&1 | FileCheck --check-prefix=CHECK-NEG %s -! Check that the compiler accepts -f[no-]init-global-zero: -! RUN: %flang_fc1 -emit-hlfir -finit-global-zero %s -o - -! RUN: %flang_fc1 -emit-hlfir -fno-init-global-zero %s -o - - -! CHECK-POS: "-fc1"{{.*}}"-finit-global-zero" -! CHECK-NEG: "-fc1"{{.*}}"-fno-init-global-zero" diff --git a/flang/test/Lower/zero_init.f90 b/flang/test/Lower/zero_init.f90 deleted file mode 100644 index 5ed6f2247de3b..0000000000000 --- a/flang/test/Lower/zero_init.f90 +++ /dev/null @@ -1,20 +0,0 @@ -! RUN: %flang_fc1 -emit-hlfir -o - %s | FileCheck --check-prefix=CHECK-DEFAULT %s -! RUN: %flang_fc1 -finit-global-zero -emit-hlfir -o - %s | FileCheck --check-prefix=CHECK-DEFAULT %s -! 
RUN: %flang_fc1 -fno-init-global-zero -emit-hlfir -o - %s | FileCheck --check-prefix=CHECK-NO-ZERO-INIT %s -! RUN: bbc -emit-hlfir -o - %s | FileCheck --check-prefix=CHECK-DEFAULT %s -! RUN: bbc -finit-global-zero -emit-hlfir -o - %s | FileCheck --check-prefix=CHECK-DEFAULT %s -! RUN: bbc -finit-global-zero=false -emit-hlfir -o - %s | FileCheck --check-prefix=CHECK-NO-ZERO-INIT %s - -module m1 - real :: x -end module m1 - -!CHECK-DEFAULT: fir.global @_QMm1Ex : f32 { -!CHECK-DEFAULT: %[[UNDEF:.*]] = fir.zero_bits f32 -!CHECK-DEFAULT: fir.has_value %[[UNDEF]] : f32 -!CHECK-DEFAULT: } - -!CHECK-NO-ZERO-INIT: fir.global @_QMm1Ex : f32 { -!CHECK-NO-ZERO-INIT: %[[UNDEF:.*]] = fir.undefined f32 -!CHECK-NO-ZERO-INIT: fir.has_value %[[UNDEF]] : f32 -!CHECK-NO-ZERO-INIT: } diff --git a/flang/test/Lower/zero_init_default_init.f90 b/flang/test/Lower/zero_init_default_init.f90 deleted file mode 100644 index e2d1f545e35a5..0000000000000 --- a/flang/test/Lower/zero_init_default_init.f90 +++ /dev/null @@ -1,22 +0,0 @@ -! RUN: %flang_fc1 -emit-hlfir -o - %s | FileCheck %s -! RUN: %flang_fc1 -finit-global-zero -emit-hlfir -o - %s | FileCheck %s -! RUN: %flang_fc1 -fno-init-global-zero -emit-hlfir -o - %s | FileCheck %s -! RUN: bbc -emit-hlfir -o - %s | FileCheck %s -! RUN: bbc -finit-global-zero -emit-hlfir -o - %s | FileCheck %s -! RUN: bbc -finit-global-zero=false -emit-hlfir -o - %s | FileCheck %s - -! Test that the flag does not affect globals with default init - -module m2 - type val - integer :: my_val = 1 - end type val - type(val) :: v1 -end module m2 - -!CHECK: fir.global @_QMm2Ev1 : !fir.type<_QMm2Tval{my_val:i32}> { -!CHECK: %[[V1:.*]] = fir.undefined !fir.type<_QMm2Tval{my_val:i32}> -!CHECK: %[[ONE:.*]] = arith.constant 1 : i32 -!CHECK: %[[V1_INIT:.*]] = fir.insert_value %[[V1]], %[[ONE]], ["my_val", !fir.type<_QMm2Tval{my_val:i32}>] : (!fir.type<_QMm2Tval{my_val:i32}>, i32) -> !fir.type<_QMm2Tval{my_val:i32}> -!CHECK: fir.has_value %[[V1_INIT]] : !fir.type<_QMm2Tval{my_val:i32}> -!CHECK: } diff --git a/flang/tools/bbc/bbc.cpp b/flang/tools/bbc/bbc.cpp index dafbcd856389a..7efc460be8679 100644 --- a/flang/tools/bbc/bbc.cpp +++ b/flang/tools/bbc/bbc.cpp @@ -234,11 +234,6 @@ static llvm::cl::opt integerWrapAround( llvm::cl::desc("Treat signed integer overflow as two's complement"), llvm::cl::init(false)); -static llvm::cl::opt initGlobalZero( - "finit-global-zero", - llvm::cl::desc("Zero initialize globals without default initialization"), - llvm::cl::init(true)); - static llvm::cl::opt reallocateLHS("frealloc-lhs", llvm::cl::desc("Follow Fortran 2003 rules for (re)allocating " @@ -386,7 +381,6 @@ static llvm::LogicalResult convertFortranSourceToMLIR( loweringOptions.setNoPPCNativeVecElemOrder(enableNoPPCNativeVecElemOrder); loweringOptions.setLowerToHighLevelFIR(useHLFIR || emitHLFIR); loweringOptions.setIntegerWrapAround(integerWrapAround); - loweringOptions.setInitGlobalZero(initGlobalZero); loweringOptions.setReallocateLHS(reallocateLHS); std::vector envDefaults = {}; Fortran::frontend::TargetOptions targetOpts; From 22637a877ae7fbfd5cf030400979fd4527eaebcf Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Fri, 17 Jan 2025 12:52:24 +0000 Subject: [PATCH 41/45] [Loads] Respect UseDerefAtPointSemantics in isDerefAndAlignedPointer. (#123196) If a pointer gets freed, it may not be dereferenceable any longer, even though there is a dominating dereferenceable assumption. As first step, only consider assumptions if the pointer value cannot be freed if UseDerefAtPointSemantics is used. 
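A rough C++ sketch of the hazard this guards against (illustrative only, not taken from the patch; the function names are made up, and the real change operates on "dereferenceable" assume operand bundles rather than source code):

    #include <cstdlib>

    // 'release' is a hypothetical callback that may call free(p).
    int sum(int *p, bool other, void (*release)(int *)) {
      int first = *p;        // p is known dereferenceable at this point.
      release(p);            // p may be freed by this call...
      if (other)
        return first + *p;   // ...so this load must stay guarded by the branch
      return first;          //    and cannot be speculated above the check.
    }

This mirrors the shape of the new vectorizer tests below: an assumption made before a call that may free the pointer no longer licenses an unconditional load afterwards, so the loads become predicated.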
PR: https://github.com/llvm/llvm-project/pull/123196 --- llvm/lib/Analysis/Loads.cpp | 4 +- llvm/lib/IR/Value.cpp | 2 +- ...able-info-from-assumption-constant-size.ll | 156 +++++++++++++++--- 3 files changed, 138 insertions(+), 24 deletions(-) diff --git a/llvm/lib/Analysis/Loads.cpp b/llvm/lib/Analysis/Loads.cpp index 7bbd469bd035d..11ccfa33821ca 100644 --- a/llvm/lib/Analysis/Loads.cpp +++ b/llvm/lib/Analysis/Loads.cpp @@ -25,6 +25,8 @@ using namespace llvm; +extern cl::opt UseDerefAtPointSemantics; + static bool isAligned(const Value *Base, Align Alignment, const DataLayout &DL) { return Base->getPointerAlignment(DL) >= Alignment; @@ -168,7 +170,7 @@ static bool isDereferenceableAndAlignedPointer( Size, DL, CtxI, AC, DT, TLI, Visited, MaxDepth); - if (CtxI) { + if (CtxI && (!UseDerefAtPointSemantics || !V->canBeFreed())) { /// Look through assumes to see if both dereferencability and alignment can /// be proven by an assume if needed. RetainedKnowledge AlignRK; diff --git a/llvm/lib/IR/Value.cpp b/llvm/lib/IR/Value.cpp index 65b63955b6f6d..eddb67282fca4 100644 --- a/llvm/lib/IR/Value.cpp +++ b/llvm/lib/IR/Value.cpp @@ -36,7 +36,7 @@ using namespace llvm; -static cl::opt UseDerefAtPointSemantics( +cl::opt UseDerefAtPointSemantics( "use-dereferenceable-at-point-semantics", cl::Hidden, cl::init(false), cl::desc("Deref attributes and metadata infer facts at definition only")); diff --git a/llvm/test/Transforms/LoopVectorize/dereferenceable-info-from-assumption-constant-size.ll b/llvm/test/Transforms/LoopVectorize/dereferenceable-info-from-assumption-constant-size.ll index 572511a5ffb92..90671689f1dce 100644 --- a/llvm/test/Transforms/LoopVectorize/dereferenceable-info-from-assumption-constant-size.ll +++ b/llvm/test/Transforms/LoopVectorize/dereferenceable-info-from-assumption-constant-size.ll @@ -1,9 +1,9 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 -; RUN: opt -p loop-vectorize -force-vector-width=2 -use-dereferenceable-at-point-semantics=1 -S %s | FileCheck %s +; RUN: opt -p loop-vectorize -force-vector-width=2 -use-dereferenceable-at-point-semantics -S %s | FileCheck %s declare void @llvm.assume(i1) -define void @deref_assumption_in_header_constant_trip_count(ptr noalias %a, ptr noalias %b, ptr noalias %c) nofree { +define void @deref_assumption_in_header_constant_trip_count(ptr noalias %a, ptr noalias %b, ptr noalias %c) nofree nosync{ ; CHECK-LABEL: define void @deref_assumption_in_header_constant_trip_count( ; CHECK-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR1:[0-9]+]] { ; CHECK-NEXT: [[ENTRY:.*]]: @@ -104,7 +104,7 @@ exit: ret void } -define void @align_deref_assumption_in_header_constant_trip_count_loop_invariant_ptr(ptr noalias %a, ptr noalias %b, ptr noalias %c) nofree { +define void @align_deref_assumption_in_header_constant_trip_count_loop_invariant_ptr(ptr noalias %a, ptr noalias %b, ptr noalias %c) nofree nosync{ ; CHECK-LABEL: define void @align_deref_assumption_in_header_constant_trip_count_loop_invariant_ptr( ; CHECK-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: [[ENTRY:.*]]: @@ -181,7 +181,7 @@ exit: ret void } -define void @deref_assumption_too_small_in_header_constant_trip_count(ptr noalias %a, ptr noalias %b, ptr noalias %c) nofree { +define void @deref_assumption_too_small_in_header_constant_trip_count(ptr noalias %a, ptr noalias %b, ptr noalias %c) nofree nosync{ ; CHECK-LABEL: define void 
@deref_assumption_too_small_in_header_constant_trip_count( ; CHECK-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: [[ENTRY:.*]]: @@ -282,7 +282,7 @@ exit: ret void } -define void @deref_assumption_in_header_constant_trip_count_align_1(ptr noalias %a, ptr noalias %b, ptr noalias %c) nofree { +define void @deref_assumption_in_header_constant_trip_count_align_1(ptr noalias %a, ptr noalias %b, ptr noalias %c) nofree nosync{ ; CHECK-LABEL: define void @deref_assumption_in_header_constant_trip_count_align_1( ; CHECK-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: [[ENTRY:.*]]: @@ -383,7 +383,7 @@ exit: ret void } -define void @deref_assumption_in_header_constant_trip_count_align_via_arg_attribute(ptr noalias align 4 %a, ptr noalias %b, ptr noalias %c) nofree { +define void @deref_assumption_in_header_constant_trip_count_align_via_arg_attribute(ptr noalias align 4 %a, ptr noalias %b, ptr noalias %c) nofree nosync{ ; CHECK-LABEL: define void @deref_assumption_in_header_constant_trip_count_align_via_arg_attribute( ; CHECK-SAME: ptr noalias align 4 [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: [[ENTRY:.*]]: @@ -484,7 +484,7 @@ exit: ret void } -define void @deref_assumption_in_header_constant_trip_count_align_not_known(ptr noalias %a, ptr noalias %b, ptr noalias %c) nofree { +define void @deref_assumption_in_header_constant_trip_count_align_not_known(ptr noalias %a, ptr noalias %b, ptr noalias %c) nofree nosync{ ; CHECK-LABEL: define void @deref_assumption_in_header_constant_trip_count_align_not_known( ; CHECK-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: [[ENTRY:.*]]: @@ -585,7 +585,7 @@ exit: ret void } -define void @deref_assumption_in_then_constant_trip_count(ptr noalias %a, ptr noalias %b, ptr noalias %c) nofree { +define void @deref_assumption_in_then_constant_trip_count(ptr noalias %a, ptr noalias %b, ptr noalias %c) nofree nosync{ ; CHECK-LABEL: define void @deref_assumption_in_then_constant_trip_count( ; CHECK-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: [[ENTRY:.*]]: @@ -682,7 +682,7 @@ exit: ret void } -define void @deref_assumption_in_latch_constant_trip_count(ptr noalias %a, ptr noalias %b, ptr noalias %c) nofree { +define void @deref_assumption_in_latch_constant_trip_count(ptr noalias %a, ptr noalias %b, ptr noalias %c) nofree nosync{ ; CHECK-LABEL: define void @deref_assumption_in_latch_constant_trip_count( ; CHECK-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: [[ENTRY:.*]]: @@ -785,7 +785,7 @@ exit: ret void } -define void @deref_assumption_in_header_variable_trip_count(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) nofree { +define void @deref_assumption_in_header_variable_trip_count(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) nofree nosync{ ; CHECK-LABEL: define void @deref_assumption_in_header_variable_trip_count( ; CHECK-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]], i64 [[N:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: [[ENTRY:.*]]: @@ -890,7 +890,7 @@ exit: ret void } -define void @deref_assumption_in_preheader_constant_trip_count_align_1(ptr noalias %a, ptr noalias %b, ptr noalias %c) nofree { +define void @deref_assumption_in_preheader_constant_trip_count_align_1(ptr noalias %a, ptr noalias %b, ptr noalias %c) nofree nosync{ ; 
CHECK-LABEL: define void @deref_assumption_in_preheader_constant_trip_count_align_1( ; CHECK-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: [[ENTRY:.*]]: @@ -968,7 +968,7 @@ exit: ret void } -define void @deref_assumption_too_small_in_preheader_constant_trip_count_align_1(ptr noalias %a, ptr noalias %b, ptr noalias %c) nofree { +define void @deref_assumption_too_small_in_preheader_constant_trip_count_align_1(ptr noalias %a, ptr noalias %b, ptr noalias %c) nofree nosync{ ; CHECK-LABEL: define void @deref_assumption_too_small_in_preheader_constant_trip_count_align_1( ; CHECK-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: [[ENTRY:.*]]: @@ -1063,7 +1063,7 @@ exit: ret void } -define void @align_and_deref_assumption_in_preheader_constant_trip_count_align_4(ptr noalias %a, ptr noalias %b, ptr noalias %c) nofree { +define void @align_and_deref_assumption_in_preheader_constant_trip_count_align_4(ptr noalias %a, ptr noalias %b, ptr noalias %c) nofree nosync{ ; CHECK-LABEL: define void @align_and_deref_assumption_in_preheader_constant_trip_count_align_4( ; CHECK-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: [[ENTRY:.*]]: @@ -1142,7 +1142,7 @@ exit: } -define void @deref_assumption_in_preheader_constant_trip_count_align_4_known_via_argument_attr(ptr noalias align 4 %a, ptr noalias %b, ptr noalias %c) nofree { +define void @deref_assumption_in_preheader_constant_trip_count_align_4_known_via_argument_attr(ptr noalias align 4 %a, ptr noalias %b, ptr noalias %c) nofree nosync{ ; CHECK-LABEL: define void @deref_assumption_in_preheader_constant_trip_count_align_4_known_via_argument_attr( ; CHECK-SAME: ptr noalias align 4 [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: [[ENTRY:.*]]: @@ -1220,7 +1220,7 @@ exit: ret void } -define void @deref_assumption_in_preheader_constant_trip_count_align_4_not_known(ptr noalias %a, ptr noalias %b, ptr noalias %c) nofree { +define void @deref_assumption_in_preheader_constant_trip_count_align_4_not_known(ptr noalias %a, ptr noalias %b, ptr noalias %c) nofree nosync{ ; CHECK-LABEL: define void @deref_assumption_in_preheader_constant_trip_count_align_4_not_known( ; CHECK-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: [[ENTRY:.*]]: @@ -1315,7 +1315,7 @@ exit: ret void } -define void @deref_assumption_too_small_in_preheader_constant_trip_count_align_4(ptr noalias %a, ptr noalias %b, ptr noalias %c) nofree { +define void @deref_assumption_too_small_in_preheader_constant_trip_count_align_4(ptr noalias %a, ptr noalias %b, ptr noalias %c) nofree nosync{ ; CHECK-LABEL: define void @deref_assumption_too_small_in_preheader_constant_trip_count_align_4( ; CHECK-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: [[ENTRY:.*]]: @@ -1410,8 +1410,8 @@ exit: ret void } -; %a may be freeed between the dereferenceable assumption and accesses. -; FIXME: It is not safe to use with -use-dereferenceable-at-point-semantics. +; %a may be freed between the dereferenceable assumption and accesses. +; It is not safe to use with -use-dereferenceable-at-point-semantics. 
define void @may_free_align_deref_assumption_in_header_constant_trip_count_loop_invariant_ptr(ptr noalias %a, ptr noalias %b, ptr noalias %c) { ; CHECK-LABEL: define void @may_free_align_deref_assumption_in_header_constant_trip_count_loop_invariant_ptr( ; CHECK-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) { @@ -1422,16 +1422,29 @@ define void @may_free_align_deref_assumption_in_header_constant_trip_count_loop_ ; CHECK: [[VECTOR_PH]]: ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_LOAD_CONTINUE2:.*]] ] ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP0]] ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP2]], align 4 ; CHECK-NEXT: [[TMP3:%.*]] = icmp sge <2 x i32> [[WIDE_LOAD]], zeroinitializer -; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[A]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = xor <2 x i1> [[TMP3]], splat (i1 true) +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i1> [[TMP4]], i32 0 +; CHECK-NEXT: br i1 [[TMP5]], label %[[PRED_LOAD_IF:.*]], label %[[PRED_LOAD_CONTINUE:.*]] +; CHECK: [[PRED_LOAD_IF]]: ; CHECK-NEXT: [[TMP15:%.*]] = load i32, ptr [[A]], align 4 -; CHECK-NEXT: [[TMP13:%.*]] = insertelement <2 x i32> poison, i32 [[TMP5]], i32 0 -; CHECK-NEXT: [[TMP11:%.*]] = insertelement <2 x i32> [[TMP13]], i32 [[TMP15]], i32 1 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x i32> poison, i32 [[TMP15]], i32 0 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE]] +; CHECK: [[PRED_LOAD_CONTINUE]]: +; CHECK-NEXT: [[TMP12:%.*]] = phi <2 x i32> [ poison, %[[VECTOR_BODY]] ], [ [[TMP7]], %[[PRED_LOAD_IF]] ] +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x i1> [[TMP4]], i32 1 +; CHECK-NEXT: br i1 [[TMP13]], label %[[PRED_LOAD_IF1:.*]], label %[[PRED_LOAD_CONTINUE2]] +; CHECK: [[PRED_LOAD_IF1]]: +; CHECK-NEXT: [[TMP14:%.*]] = load i32, ptr [[A]], align 4 +; CHECK-NEXT: [[TMP16:%.*]] = insertelement <2 x i32> [[TMP12]], i32 [[TMP14]], i32 1 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE2]] +; CHECK: [[PRED_LOAD_CONTINUE2]]: +; CHECK-NEXT: [[TMP11:%.*]] = phi <2 x i32> [ [[TMP12]], %[[PRED_LOAD_CONTINUE]] ], [ [[TMP16]], %[[PRED_LOAD_IF1]] ] ; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP3]], <2 x i32> [[WIDE_LOAD]], <2 x i32> [[TMP11]] ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[TMP0]] ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 0 @@ -1491,6 +1504,103 @@ exit: ret void } +; %a may be freed between the dereferenceable assumption and accesses. +; It is not safe to use with -use-dereferenceable-at-point-semantics. 
+define void @may_free_local_ptr_align_deref_assumption_in_header_constant_trip_count_loop_invariant_ptr(ptr noalias %b, ptr noalias %c) nofree nosync { +; CHECK-LABEL: define void @may_free_local_ptr_align_deref_assumption_in_header_constant_trip_count_loop_invariant_ptr( +; CHECK-SAME: ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[A:%.*]] = call ptr @get_ptr() +; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[A]], i64 4), "dereferenceable"(ptr [[A]], i64 4) ] +; CHECK-NEXT: call void @may_free() +; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_LOAD_CONTINUE2:.*]] ] +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP2]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = icmp sge <2 x i32> [[WIDE_LOAD]], zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = xor <2 x i1> [[TMP3]], splat (i1 true) +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i1> [[TMP4]], i32 0 +; CHECK-NEXT: br i1 [[TMP5]], label %[[PRED_LOAD_IF:.*]], label %[[PRED_LOAD_CONTINUE:.*]] +; CHECK: [[PRED_LOAD_IF]]: +; CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr [[A]], align 4 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x i32> poison, i32 [[TMP6]], i32 0 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE]] +; CHECK: [[PRED_LOAD_CONTINUE]]: +; CHECK-NEXT: [[TMP8:%.*]] = phi <2 x i32> [ poison, %[[VECTOR_BODY]] ], [ [[TMP7]], %[[PRED_LOAD_IF]] ] +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x i1> [[TMP4]], i32 1 +; CHECK-NEXT: br i1 [[TMP9]], label %[[PRED_LOAD_IF1:.*]], label %[[PRED_LOAD_CONTINUE2]] +; CHECK: [[PRED_LOAD_IF1]]: +; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr [[A]], align 4 +; CHECK-NEXT: [[TMP11:%.*]] = insertelement <2 x i32> [[TMP8]], i32 [[TMP10]], i32 1 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE2]] +; CHECK: [[PRED_LOAD_CONTINUE2]]: +; CHECK-NEXT: [[TMP12:%.*]] = phi <2 x i32> [ [[TMP8]], %[[PRED_LOAD_CONTINUE]] ], [ [[TMP11]], %[[PRED_LOAD_IF1]] ] +; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP3]], <2 x i32> [[WIDE_LOAD]], <2 x i32> [[TMP12]] +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 0 +; CHECK-NEXT: store <2 x i32> [[PREDPHI]], ptr [[TMP14]], align 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000 +; CHECK-NEXT: br i1 [[TMP15]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP34:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: br label %[[LOOP_HEADER:.*]] +; CHECK: [[LOOP_HEADER]]: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] +; CHECK-NEXT: [[GEP_B:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]] +; CHECK-NEXT: [[L_B:%.*]] = load i32, ptr [[GEP_B]], align 4 +; CHECK-NEXT: [[C_1:%.*]] = icmp sge i32 [[L_B]], 0 +; CHECK-NEXT: br i1 [[C_1]], label 
%[[LOOP_LATCH]], label %[[LOOP_THEN:.*]] +; CHECK: [[LOOP_THEN]]: +; CHECK-NEXT: [[L_A:%.*]] = load i32, ptr [[A]], align 4 +; CHECK-NEXT: br label %[[LOOP_LATCH]] +; CHECK: [[LOOP_LATCH]]: +; CHECK-NEXT: [[MERGE:%.*]] = phi i32 [ [[L_A]], %[[LOOP_THEN]] ], [ [[L_B]], %[[LOOP_HEADER]] ] +; CHECK-NEXT: [[GEP_C:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[IV]] +; CHECK-NEXT: store i32 [[MERGE]], ptr [[GEP_C]], align 4 +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 1000 +; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP35:![0-9]+]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; +entry: + %a = call ptr @get_ptr() + call void @llvm.assume(i1 true) [ "align"(ptr %a, i64 4), "dereferenceable"(ptr %a, i64 4) ] + call void @may_free() + br label %loop.header + +loop.header: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.latch ] + %gep.b = getelementptr inbounds i32, ptr %b, i64 %iv + %l.b = load i32, ptr %gep.b, align 4 + %c.1 = icmp sge i32 %l.b, 0 + br i1 %c.1, label %loop.latch, label %loop.then + +loop.then: + %l.a = load i32, ptr %a, align 4 + br label %loop.latch + +loop.latch: + %merge = phi i32 [ %l.a, %loop.then ], [ %l.b, %loop.header ] + %gep.c = getelementptr inbounds i32, ptr %c, i64 %iv + store i32 %merge, ptr %gep.c, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %ec = icmp eq i64 %iv.next, 1000 + br i1 %ec, label %exit, label %loop.header + +exit: + ret void +} + +declare ptr @get_ptr() declare void @may_free() ;. @@ -1528,4 +1638,6 @@ declare void @may_free() ; CHECK: [[LOOP31]] = distinct !{[[LOOP31]], [[META2]], [[META1]]} ; CHECK: [[LOOP32]] = distinct !{[[LOOP32]], [[META1]], [[META2]]} ; CHECK: [[LOOP33]] = distinct !{[[LOOP33]], [[META2]], [[META1]]} +; CHECK: [[LOOP34]] = distinct !{[[LOOP34]], [[META1]], [[META2]]} +; CHECK: [[LOOP35]] = distinct !{[[LOOP35]], [[META2]], [[META1]]} ;. From fb2c9d940ad87e6ae09e06c6915e0c925a4f87ec Mon Sep 17 00:00:00 2001 From: Chuanqi Xu Date: Fri, 17 Jan 2025 21:03:53 +0800 Subject: [PATCH 42/45] [C++20] [Modules] Makes sure internal declaration won't be found by other TU (#123059) Close https://github.com/llvm/llvm-project/issues/61427 And this is also helpful to implement https://github.com/llvm/llvm-project/issues/112294 partially. The implementation strategy mimics https://github.com/llvm/llvm-project/pull/122887. This patch split the internal declarations from the general lookup table so that other TU can't find the internal declarations. 
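A minimal C++20 sketch of the language-level behaviour this serialization change enforces (illustrative only, not taken from the patch):

    // m.cppm
    export module m;
    static int internal_helper() { return 42; }     // internal linkage: TU-local
    export int api() { return internal_helper(); }  // exported, may use it here

    // user.cpp
    import m;
    int use() {
      return api();               // OK: exported name is visible
      // internal_helper();       // must not be found: internal to m.cppm
    }

With the split lookup table, internal_helper is emitted into the TU-local table of the module unit's AST file, so name lookup in an importing translation unit no longer sees it.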
--- .../include/clang/Serialization/ASTBitCodes.h | 6 + clang/include/clang/Serialization/ASTReader.h | 20 +- clang/include/clang/Serialization/ASTWriter.h | 11 +- clang/lib/Serialization/ASTReader.cpp | 89 +++++-- clang/lib/Serialization/ASTReaderDecl.cpp | 45 +++- clang/lib/Serialization/ASTWriter.cpp | 225 +++++++++++++----- clang/lib/Serialization/ASTWriterDecl.cpp | 12 +- .../basic.lookup.argdep/p5-ex2.cpp | 4 +- .../basic.scope/basic.scope.namespace/p2.cpp | 12 +- .../CXX/module/basic/basic.def.odr/p4.cppm | 5 - .../test/CXX/module/basic/basic.link/p2.cppm | 13 +- 11 files changed, 327 insertions(+), 115 deletions(-) diff --git a/clang/include/clang/Serialization/ASTBitCodes.h b/clang/include/clang/Serialization/ASTBitCodes.h index 40dae25f7b54b..d568d2fd7aa30 100644 --- a/clang/include/clang/Serialization/ASTBitCodes.h +++ b/clang/include/clang/Serialization/ASTBitCodes.h @@ -740,6 +740,8 @@ enum ASTRecordTypes { CXX_ADDED_TEMPLATE_PARTIAL_SPECIALIZATION = 75, UPDATE_MODULE_LOCAL_VISIBLE = 76, + + UPDATE_TU_LOCAL_VISIBLE = 77, }; /// Record types used within a source manager block. @@ -1340,6 +1342,10 @@ enum DeclCode { /// only visible from DeclContext in the same module. DECL_CONTEXT_MODULE_LOCAL_VISIBLE, + /// A record that stores the set of declarations that are only visible + /// to the TU. + DECL_CONTEXT_TU_LOCAL_VISIBLE, + /// A LabelDecl record. DECL_LABEL, diff --git a/clang/include/clang/Serialization/ASTReader.h b/clang/include/clang/Serialization/ASTReader.h index c839215dc4077..82564fe664acb 100644 --- a/clang/include/clang/Serialization/ASTReader.h +++ b/clang/include/clang/Serialization/ASTReader.h @@ -528,6 +528,7 @@ class ASTReader uint64_t LexicalOffset; uint64_t VisibleOffset; uint64_t ModuleLocalOffset; + uint64_t TULocalOffset; }; using DelayedNamespaceOffsetMapTy = @@ -640,6 +641,9 @@ class ASTReader llvm::DenseMap ModuleLocalLookups; + llvm::DenseMap + TULocalLookups; using SpecLookupTableTy = llvm::DenseMap PendingVisibleUpdates; llvm::DenseMap PendingModuleLocalVisibleUpdates; + llvm::DenseMap TULocalUpdates; using SpecializationsUpdate = SmallVector; using SpecializationsUpdateMap = @@ -704,11 +709,17 @@ class ASTReader llvm::BitstreamCursor &Cursor, uint64_t Offset, DeclContext *DC); + enum class VisibleDeclContextStorageKind { + GenerallyVisible, + ModuleLocalVisible, + TULocalVisible, + }; + /// Read the record that describes the visible contents of a DC. bool ReadVisibleDeclContextStorage(ModuleFile &M, llvm::BitstreamCursor &Cursor, uint64_t Offset, GlobalDeclID ID, - bool IsModuleLocal); + VisibleDeclContextStorageKind VisibleKind); bool ReadSpecializations(ModuleFile &M, llvm::BitstreamCursor &Cursor, uint64_t Offset, Decl *D, bool IsPartial); @@ -1148,6 +1159,10 @@ class ASTReader unsigned NumModuleLocalVisibleDeclContexts = 0, TotalModuleLocalVisibleDeclContexts = 0; + /// Number of TU Local decl contexts read/total + unsigned NumTULocalVisibleDeclContexts = 0, + TotalTULocalVisibleDeclContexts = 0; + /// Total size of modules, in bits, currently loaded uint64_t TotalModulesSizeInBits = 0; @@ -1481,6 +1496,9 @@ class ASTReader const serialization::reader::ModuleLocalLookupTable * getModuleLocalLookupTables(DeclContext *Primary) const; + const serialization::reader::DeclContextLookupTable * + getTULocalLookupTables(DeclContext *Primary) const; + /// Get the loaded specializations lookup tables for \p D, /// if any. 
serialization::reader::LazySpecializationInfoLookupTable * diff --git a/clang/include/clang/Serialization/ASTWriter.h b/clang/include/clang/Serialization/ASTWriter.h index 53b09cc914392..079e39a9fb678 100644 --- a/clang/include/clang/Serialization/ASTWriter.h +++ b/clang/include/clang/Serialization/ASTWriter.h @@ -496,6 +496,9 @@ class ASTWriter : public ASTDeserializationListener, /// file. unsigned NumModuleLocalDeclContexts = 0; + /// The number of TULocal declcontexts written to the AST file. + unsigned NumTULocalDeclContexts = 0; + /// A mapping from each known submodule to its ID number, which will /// be a positive integer. llvm::DenseMap SubmoduleIDs; @@ -594,12 +597,14 @@ class ASTWriter : public ASTDeserializationListener, void GenerateNameLookupTable(ASTContext &Context, const DeclContext *DC, llvm::SmallVectorImpl &LookupTable, - llvm::SmallVectorImpl &ModuleLocalLookupTable); + llvm::SmallVectorImpl &ModuleLocalLookupTable, + llvm::SmallVectorImpl &TULocalLookupTable); uint64_t WriteDeclContextLexicalBlock(ASTContext &Context, const DeclContext *DC); void WriteDeclContextVisibleBlock(ASTContext &Context, DeclContext *DC, uint64_t &VisibleBlockOffset, - uint64_t &ModuleLocalBlockOffset); + uint64_t &ModuleLocalBlockOffset, + uint64_t &TULocalBlockOffset); void WriteTypeDeclOffsets(); void WriteFileDeclIDsMap(); void WriteComments(ASTContext &Context); @@ -633,8 +638,10 @@ class ASTWriter : public ASTDeserializationListener, unsigned DeclContextLexicalAbbrev = 0; unsigned DeclContextVisibleLookupAbbrev = 0; unsigned DeclModuleLocalVisibleLookupAbbrev = 0; + unsigned DeclTULocalLookupAbbrev = 0; unsigned UpdateVisibleAbbrev = 0; unsigned ModuleLocalUpdateVisibleAbbrev = 0; + unsigned TULocalUpdateVisibleAbbrev = 0; unsigned DeclRecordAbbrev = 0; unsigned DeclTypedefAbbrev = 0; unsigned DeclVarAbbrev = 0; diff --git a/clang/lib/Serialization/ASTReader.cpp b/clang/lib/Serialization/ASTReader.cpp index d08dc6b1b4d93..a72ff766685bb 100644 --- a/clang/lib/Serialization/ASTReader.cpp +++ b/clang/lib/Serialization/ASTReader.cpp @@ -1425,10 +1425,9 @@ bool ASTReader::ReadLexicalDeclContextStorage(ModuleFile &M, return false; } -bool ASTReader::ReadVisibleDeclContextStorage(ModuleFile &M, - BitstreamCursor &Cursor, - uint64_t Offset, GlobalDeclID ID, - bool IsModuleLocal) { +bool ASTReader::ReadVisibleDeclContextStorage( + ModuleFile &M, BitstreamCursor &Cursor, uint64_t Offset, GlobalDeclID ID, + ASTReader::VisibleDeclContextStorageKind VisibleKind) { assert(Offset != 0); SavedStreamPosition SavedPosition(Cursor); @@ -1452,22 +1451,42 @@ bool ASTReader::ReadVisibleDeclContextStorage(ModuleFile &M, return true; } unsigned RecCode = MaybeRecCode.get(); - if (!IsModuleLocal && RecCode != DECL_CONTEXT_VISIBLE) { - Error("Expected visible lookup table block"); - return true; - } - if (IsModuleLocal && RecCode != DECL_CONTEXT_MODULE_LOCAL_VISIBLE) { - Error("Expected module local visible lookup table block"); - return true; + switch (VisibleKind) { + case VisibleDeclContextStorageKind::GenerallyVisible: + if (RecCode != DECL_CONTEXT_VISIBLE) { + Error("Expected visible lookup table block"); + return true; + } + break; + case VisibleDeclContextStorageKind::ModuleLocalVisible: + if (RecCode != DECL_CONTEXT_MODULE_LOCAL_VISIBLE) { + Error("Expected module local visible lookup table block"); + return true; + } + break; + case VisibleDeclContextStorageKind::TULocalVisible: + if (RecCode != DECL_CONTEXT_TU_LOCAL_VISIBLE) { + Error("Expected TU local lookup table block"); + return true; + } + break; 
} // We can't safely determine the primary context yet, so delay attaching the // lookup table until we're done with recursive deserialization. auto *Data = (const unsigned char*)Blob.data(); - if (!IsModuleLocal) + switch (VisibleKind) { + case VisibleDeclContextStorageKind::GenerallyVisible: PendingVisibleUpdates[ID].push_back(UpdateData{&M, Data}); - else + break; + case VisibleDeclContextStorageKind::ModuleLocalVisible: PendingModuleLocalVisibleUpdates[ID].push_back(UpdateData{&M, Data}); + break; + case VisibleDeclContextStorageKind::TULocalVisible: + if (M.Kind == MK_MainFile) + TULocalUpdates[ID].push_back(UpdateData{&M, Data}); + break; + } return false; } @@ -3613,6 +3632,21 @@ llvm::Error ASTReader::ReadASTBlock(ModuleFile &F, break; } + case UPDATE_TU_LOCAL_VISIBLE: { + if (F.Kind != MK_MainFile) + break; + unsigned Idx = 0; + GlobalDeclID ID = ReadDeclID(F, Record, Idx); + auto *Data = (const unsigned char *)Blob.data(); + TULocalUpdates[ID].push_back(UpdateData{&F, Data}); + // If we've already loaded the decl, perform the updates when we finish + // loading this block. + if (Decl *D = GetExistingDecl(ID)) + PendingUpdateRecords.push_back( + PendingUpdateRecord(ID, D, /*JustLoaded=*/false)); + break; + } + case CXX_ADDED_TEMPLATE_SPECIALIZATION: { unsigned Idx = 0; GlobalDeclID ID = ReadDeclID(F, Record, Idx); @@ -3717,6 +3751,7 @@ llvm::Error ASTReader::ReadASTBlock(ModuleFile &F, TotalLexicalDeclContexts += Record[2]; TotalVisibleDeclContexts += Record[3]; TotalModuleLocalVisibleDeclContexts += Record[4]; + TotalTULocalVisibleDeclContexts += Record[5]; break; case UNUSED_FILESCOPED_DECLS: @@ -4002,7 +4037,7 @@ llvm::Error ASTReader::ReadASTBlock(ModuleFile &F, break; case DELAYED_NAMESPACE_LEXICAL_VISIBLE_RECORD: { - if (Record.size() % 4 != 0) + if (Record.size() % 5 != 0) return llvm::createStringError( std::errc::illegal_byte_sequence, "invalid DELAYED_NAMESPACE_LEXICAL_VISIBLE_RECORD block in AST " @@ -4021,9 +4056,12 @@ llvm::Error ASTReader::ReadASTBlock(ModuleFile &F, uint64_t LocalModuleLocalOffset = Record[I++]; uint64_t ModuleLocalOffset = LocalModuleLocalOffset ? BaseOffset + LocalModuleLocalOffset : 0; + uint64_t TULocalLocalOffset = Record[I++]; + uint64_t TULocalOffset = + TULocalLocalOffset ? BaseOffset + TULocalLocalOffset : 0; DelayedNamespaceOffsetMap[ID] = {LexicalOffset, VisibleOffset, - ModuleLocalOffset}; + ModuleLocalOffset, TULocalOffset}; assert(!GetExistingDecl(ID) && "We shouldn't load the namespace in the front of delayed " @@ -8473,6 +8511,15 @@ bool ASTReader::FindExternalVisibleDeclsByName(const DeclContext *DC, } } + if (auto It = TULocalLookups.find(DC); It != TULocalLookups.end()) { + ++NumTULocalVisibleDeclContexts; + for (GlobalDeclID ID : It->second.Table.find(Name)) { + NamedDecl *ND = cast(GetDecl(ID)); + if (ND->getDeclName() == Name && Found.insert(ND).second) + Decls.push_back(ND); + } + } + SetExternalVisibleDeclsForName(DC, Name, Decls); return !Decls.empty(); } @@ -8500,6 +8547,7 @@ void ASTReader::completeVisibleDeclsMap(const DeclContext *DC) { findAll(Lookups, NumVisibleDeclContextsRead); findAll(ModuleLocalLookups, NumModuleLocalVisibleDeclContexts); + findAll(TULocalLookups, NumTULocalVisibleDeclContexts); for (DeclsMap::iterator I = Decls.begin(), E = Decls.end(); I != E; ++I) { SetExternalVisibleDeclsForName(DC, I->first, I->second); @@ -8519,6 +8567,12 @@ ASTReader::getModuleLocalLookupTables(DeclContext *Primary) const { return I == ModuleLocalLookups.end() ? 
nullptr : &I->second; } +const serialization::reader::DeclContextLookupTable * +ASTReader::getTULocalLookupTables(DeclContext *Primary) const { + auto I = TULocalLookups.find(Primary); + return I == TULocalLookups.end() ? nullptr : &I->second; +} + serialization::reader::LazySpecializationInfoLookupTable * ASTReader::getLoadedSpecializationsLookupTables(const Decl *D, bool IsPartial) { assert(D->isCanonicalDecl()); @@ -8634,6 +8688,11 @@ void ASTReader::PrintStats() { NumModuleLocalVisibleDeclContexts, TotalModuleLocalVisibleDeclContexts, ((float)NumModuleLocalVisibleDeclContexts / TotalModuleLocalVisibleDeclContexts * 100)); + if (TotalTULocalVisibleDeclContexts) + std::fprintf(stderr, " %u/%u visible declcontexts in GMF read (%f%%)\n", + NumTULocalVisibleDeclContexts, TotalTULocalVisibleDeclContexts, + ((float)NumTULocalVisibleDeclContexts / + TotalTULocalVisibleDeclContexts * 100)); if (TotalNumMethodPoolEntries) std::fprintf(stderr, " %u/%u method pool entries read (%f%%)\n", NumMethodPoolEntriesRead, TotalNumMethodPoolEntries, diff --git a/clang/lib/Serialization/ASTReaderDecl.cpp b/clang/lib/Serialization/ASTReaderDecl.cpp index 06dff02ac6128..de834285fa76b 100644 --- a/clang/lib/Serialization/ASTReaderDecl.cpp +++ b/clang/lib/Serialization/ASTReaderDecl.cpp @@ -414,7 +414,8 @@ class ASTDeclReader : public DeclVisitor { void VisitLifetimeExtendedTemporaryDecl(LifetimeExtendedTemporaryDecl *D); void VisitDeclContext(DeclContext *DC, uint64_t &LexicalOffset, - uint64_t &VisibleOffset, uint64_t &ModuleLocalOffset); + uint64_t &VisibleOffset, uint64_t &ModuleLocalOffset, + uint64_t &TULocalOffset); template RedeclarableResult VisitRedeclarable(Redeclarable *D); @@ -1859,7 +1860,9 @@ void ASTDeclReader::VisitHLSLBufferDecl(HLSLBufferDecl *D) { uint64_t LexicalOffset = 0; uint64_t VisibleOffset = 0; uint64_t ModuleLocalOffset = 0; - VisitDeclContext(D, LexicalOffset, VisibleOffset, ModuleLocalOffset); + uint64_t TULocalOffset = 0; + VisitDeclContext(D, LexicalOffset, VisibleOffset, ModuleLocalOffset, + TULocalOffset); D->IsCBuffer = Record.readBool(); D->KwLoc = readSourceLocation(); D->LBraceLoc = readSourceLocation(); @@ -2770,10 +2773,12 @@ void ASTDeclReader::VisitLifetimeExtendedTemporaryDecl( void ASTDeclReader::VisitDeclContext(DeclContext *DC, uint64_t &LexicalOffset, uint64_t &VisibleOffset, - uint64_t &ModuleLocalOffset) { + uint64_t &ModuleLocalOffset, + uint64_t &TULocalOffset) { LexicalOffset = ReadLocalOffset(); VisibleOffset = ReadLocalOffset(); ModuleLocalOffset = ReadLocalOffset(); + TULocalOffset = ReadLocalOffset(); } template @@ -3903,6 +3908,7 @@ Decl *ASTReader::ReadDeclRecord(GlobalDeclID ID) { case DECL_CONTEXT_LEXICAL: case DECL_CONTEXT_VISIBLE: case DECL_CONTEXT_MODULE_LOCAL_VISIBLE: + case DECL_CONTEXT_TU_LOCAL_VISIBLE: case DECL_SPECIALIZATIONS: case DECL_PARTIAL_SPECIALIZATIONS: llvm_unreachable("Record cannot be de-serialized with readDeclRecord"); @@ -4213,9 +4219,10 @@ Decl *ASTReader::ReadDeclRecord(GlobalDeclID ID) { uint64_t LexicalOffset = 0; uint64_t VisibleOffset = 0; uint64_t ModuleLocalOffset = 0; + uint64_t TULocalOffset = 0; - Reader.VisitDeclContext(DC, LexicalOffset, VisibleOffset, - ModuleLocalOffset); + Reader.VisitDeclContext(DC, LexicalOffset, VisibleOffset, ModuleLocalOffset, + TULocalOffset); // Get the lexical and visible block for the delayed namespace. // It is sufficient to judge if ID is in DelayedNamespaceOffsetMap. 
@@ -4227,18 +4234,24 @@ Decl *ASTReader::ReadDeclRecord(GlobalDeclID ID) { LexicalOffset = Iter->second.LexicalOffset; VisibleOffset = Iter->second.VisibleOffset; ModuleLocalOffset = Iter->second.ModuleLocalOffset; + TULocalOffset = Iter->second.TULocalOffset; } if (LexicalOffset && ReadLexicalDeclContextStorage(*Loc.F, DeclsCursor, LexicalOffset, DC)) return nullptr; - if (VisibleOffset && - ReadVisibleDeclContextStorage(*Loc.F, DeclsCursor, VisibleOffset, ID, - /*IsModuleLocal=*/false)) + if (VisibleOffset && ReadVisibleDeclContextStorage( + *Loc.F, DeclsCursor, VisibleOffset, ID, + VisibleDeclContextStorageKind::GenerallyVisible)) return nullptr; if (ModuleLocalOffset && - ReadVisibleDeclContextStorage(*Loc.F, DeclsCursor, ModuleLocalOffset, - ID, /*IsModuleLocal=*/true)) + ReadVisibleDeclContextStorage( + *Loc.F, DeclsCursor, ModuleLocalOffset, ID, + VisibleDeclContextStorageKind::ModuleLocalVisible)) + return nullptr; + if (TULocalOffset && ReadVisibleDeclContextStorage( + *Loc.F, DeclsCursor, TULocalOffset, ID, + VisibleDeclContextStorageKind::TULocalVisible)) return nullptr; } assert(Record.getIdx() == Record.size()); @@ -4404,6 +4417,18 @@ void ASTReader::loadDeclUpdateRecords(PendingUpdateRecord &Record) { DC->setHasExternalVisibleStorage(true); } + if (auto I = TULocalUpdates.find(ID); I != TULocalUpdates.end()) { + auto Updates = std::move(I->second); + TULocalUpdates.erase(I); + + auto *DC = cast(D)->getPrimaryContext(); + for (const auto &Update : Updates) + TULocalLookups[DC].Table.add( + Update.Mod, Update.Data, + reader::ASTDeclContextNameLookupTrait(*this, *Update.Mod)); + DC->setHasExternalVisibleStorage(true); + } + // Load any pending related decls. if (D->isCanonicalDecl()) { if (auto IT = RelatedDeclsMap.find(ID); IT != RelatedDeclsMap.end()) { diff --git a/clang/lib/Serialization/ASTWriter.cpp b/clang/lib/Serialization/ASTWriter.cpp index 1c4f5730df312..c7c17e09a30e0 100644 --- a/clang/lib/Serialization/ASTWriter.cpp +++ b/clang/lib/Serialization/ASTWriter.cpp @@ -4047,6 +4047,13 @@ class ASTDeclContextNameLookupTraitBase { : Writer(Writer) {} public: + data_type getData(const DeclIDsTy &LocalIDs) { + unsigned Start = DeclIDs.size(); + for (auto ID : LocalIDs) + DeclIDs.push_back(ID); + return std::make_pair(Start, DeclIDs.size()); + } + data_type ImportData(const reader::ASTDeclContextNameLookupTrait::data_type &FromReader) { unsigned Start = DeclIDs.size(); DeclIDs.insert( @@ -4139,23 +4146,16 @@ class ASTDeclContextNameLookupTraitBase { } }; -class ModuleLocalNameLookupTrait : public ASTDeclContextNameLookupTraitBase { +class ModuleLevelNameLookupTrait : public ASTDeclContextNameLookupTraitBase { public: using primary_module_hash_type = unsigned; using key_type = std::pair; using key_type_ref = key_type; - explicit ModuleLocalNameLookupTrait(ASTWriter &Writer) + explicit ModuleLevelNameLookupTrait(ASTWriter &Writer) : ASTDeclContextNameLookupTraitBase(Writer) {} - data_type getData(const DeclIDsTy &LocalIDs) { - unsigned Start = DeclIDs.size(); - for (auto ID : LocalIDs) - DeclIDs.push_back(ID); - return std::make_pair(Start, DeclIDs.size()); - } - static bool EqualKey(key_type_ref a, key_type_ref b) { return a == b; } hash_value_type ComputeHash(key_type Key) { @@ -4203,19 +4203,46 @@ static bool isModuleLocalDecl(NamedDecl *D) { return false; } +static bool isTULocalInNamedModules(NamedDecl *D) { + Module *NamedModule = D->getTopLevelOwningNamedModule(); + if (!NamedModule) + return false; + + // For none-top level decls, we choose to move it to the general 
visible + // lookup table. Since the consumer may get its parent somehow and performs + // a lookup in it (considering looking up the operator function in lambda). + // The difference between module local lookup table and TU local lookup table + // is, the consumers still have a chance to lookup in the module local lookup + // table but **now** the consumers won't read the TU local lookup table if + // the consumer is not the original TU. + // + // FIXME: It seems to be an optimization chance (and also a more correct + // semantics) to remain the TULocal lookup table and performing similar lookup + // with the module local lookup table except that we only allow the lookups + // with the same module unit. + if (!D->getNonTransparentDeclContext()->isFileContext()) + return false; + + return D->getLinkageInternal() == Linkage::Internal; +} + // Trait used for the on-disk hash table used in the method pool. +template class ASTDeclContextNameLookupTrait : public ASTDeclContextNameLookupTraitBase { public: - using ModuleLocalDeclsMapTy = - llvm::DenseMap; - -private: - ModuleLocalDeclsMapTy ModuleLocalDeclsMap; + using ModuleLevelDeclsMapTy = + llvm::DenseMap; -public: using key_type = DeclarationNameKey; using key_type_ref = key_type; + using TULocalDeclsMapTy = llvm::DenseMap; + +private: + ModuleLevelDeclsMapTy ModuleLocalDeclsMap; + TULocalDeclsMapTy TULocalDeclsMap; + +public: explicit ASTDeclContextNameLookupTrait(ASTWriter &Writer) : ASTDeclContextNameLookupTraitBase(Writer) {} @@ -4251,15 +4278,30 @@ class ASTDeclContextNameLookupTrait : public ASTDeclContextNameLookupTraitBase { } } + if constexpr (CollectingTULocalDecls) { + if (isTULocalInNamedModules(D)) { + auto Iter = TULocalDeclsMap.find(D->getDeclName()); + if (Iter == TULocalDeclsMap.end()) + TULocalDeclsMap.insert({D->getDeclName(), DeclIDsTy{ID}}); + else + Iter->second.push_back(ID); + continue; + } + } + DeclIDs.push_back(ID); } return std::make_pair(Start, DeclIDs.size()); } - const ModuleLocalDeclsMapTy &getModuleLocalDecls() { + using ASTDeclContextNameLookupTraitBase::getData; + + const ModuleLevelDeclsMapTy &getModuleLocalDecls() { return ModuleLocalDeclsMap; } + const TULocalDeclsMapTy &getTULocalDecls() { return TULocalDeclsMap; } + static bool EqualKey(key_type_ref a, key_type_ref b) { return a == b; } hash_value_type ComputeHash(key_type Name) { return Name.getHash(); } @@ -4487,7 +4529,8 @@ static bool isLookupResultNotInteresting(ASTWriter &Writer, void ASTWriter::GenerateNameLookupTable( ASTContext &Context, const DeclContext *ConstDC, llvm::SmallVectorImpl &LookupTable, - llvm::SmallVectorImpl &ModuleLocalLookupTable) { + llvm::SmallVectorImpl &ModuleLocalLookupTable, + llvm::SmallVectorImpl &TULookupTable) { assert(!ConstDC->hasLazyLocalLexicalLookups() && !ConstDC->hasLazyExternalLexicalLookups() && "must call buildLookups first"); @@ -4497,9 +4540,11 @@ void ASTWriter::GenerateNameLookupTable( assert(DC == DC->getPrimaryContext() && "only primary DC has lookup table"); // Create the on-disk hash table representation. 
- MultiOnDiskHashTableGenerator Generator; - ASTDeclContextNameLookupTrait Trait(*this); + MultiOnDiskHashTableGenerator< + reader::ASTDeclContextNameLookupTrait, + ASTDeclContextNameLookupTrait> + Generator; + ASTDeclContextNameLookupTrait Trait(*this); // The first step is to collect the declaration names which we need to // serialize into the name lookup table, and to collect them in a stable @@ -4671,26 +4716,45 @@ void ASTWriter::GenerateNameLookupTable( Generator.emit(LookupTable, Trait, Lookups ? &Lookups->Table : nullptr); const auto &ModuleLocalDecls = Trait.getModuleLocalDecls(); - if (ModuleLocalDecls.empty()) - return; + if (!ModuleLocalDecls.empty()) { + MultiOnDiskHashTableGenerator + ModuleLocalLookupGenerator; + ModuleLevelNameLookupTrait ModuleLocalTrait(*this); + + for (const auto &ModuleLocalIter : ModuleLocalDecls) { + const auto &Key = ModuleLocalIter.first; + const auto &IDs = ModuleLocalIter.second; + ModuleLocalLookupGenerator.insert(Key, ModuleLocalTrait.getData(IDs), + ModuleLocalTrait); + } - MultiOnDiskHashTableGenerator - ModuleLocalLookupGenerator; - ModuleLocalNameLookupTrait ModuleLocalTrait(*this); + auto *ModuleLocalLookups = + Chain ? Chain->getModuleLocalLookupTables(DC) : nullptr; + ModuleLocalLookupGenerator.emit( + ModuleLocalLookupTable, ModuleLocalTrait, + ModuleLocalLookups ? &ModuleLocalLookups->Table : nullptr); + } + + const auto &TULocalDecls = Trait.getTULocalDecls(); + if (!TULocalDecls.empty() && !isGeneratingReducedBMI()) { + MultiOnDiskHashTableGenerator< + reader::ASTDeclContextNameLookupTrait, + ASTDeclContextNameLookupTrait> + TULookupGenerator; + ASTDeclContextNameLookupTrait TULocalTrait( + *this); + + for (const auto &TULocalIter : TULocalDecls) { + const auto &Key = TULocalIter.first; + const auto &IDs = TULocalIter.second; + TULookupGenerator.insert(Key, TULocalTrait.getData(IDs), TULocalTrait); + } - for (const auto &ModuleLocalIter : ModuleLocalDecls) { - const auto &Key = ModuleLocalIter.first; - const auto &IDs = ModuleLocalIter.second; - ModuleLocalLookupGenerator.insert(Key, ModuleLocalTrait.getData(IDs), - ModuleLocalTrait); + auto *TULocalLookups = Chain ? Chain->getTULocalLookupTables(DC) : nullptr; + TULookupGenerator.emit(TULookupTable, TULocalTrait, + TULocalLookups ? &TULocalLookups->Table : nullptr); } - - auto *ModuleLocalLookups = - Chain ? Chain->getModuleLocalLookupTables(DC) : nullptr; - ModuleLocalLookupGenerator.emit( - ModuleLocalLookupTable, ModuleLocalTrait, - ModuleLocalLookups ? &ModuleLocalLookups->Table : nullptr); } /// Write the block containing all of the declaration IDs @@ -4701,7 +4765,12 @@ void ASTWriter::GenerateNameLookupTable( void ASTWriter::WriteDeclContextVisibleBlock(ASTContext &Context, DeclContext *DC, uint64_t &VisibleBlockOffset, - uint64_t &ModuleLocalBlockOffset) { + uint64_t &ModuleLocalBlockOffset, + uint64_t &TULocalBlockOffset) { + assert(VisibleBlockOffset == 0); + assert(ModuleLocalBlockOffset == 0); + assert(TULocalBlockOffset == 0); + // If we imported a key declaration of this namespace, write the visible // lookup results as an update record for it rather than including them // on this declaration. We will only look at key declarations on reload. @@ -4788,7 +4857,9 @@ void ASTWriter::WriteDeclContextVisibleBlock(ASTContext &Context, // Create the on-disk hash table in a buffer. 
SmallString<4096> LookupTable; SmallString<4096> ModuleLocalLookupTable; - GenerateNameLookupTable(Context, DC, LookupTable, ModuleLocalLookupTable); + SmallString<4096> TULookupTable; + GenerateNameLookupTable(Context, DC, LookupTable, ModuleLocalLookupTable, + TULookupTable); // Write the lookup table RecordData::value_type Record[] = {DECL_CONTEXT_VISIBLE}; @@ -4796,17 +4867,26 @@ void ASTWriter::WriteDeclContextVisibleBlock(ASTContext &Context, LookupTable); ++NumVisibleDeclContexts; - if (ModuleLocalLookupTable.empty()) - return; + if (!ModuleLocalLookupTable.empty()) { + ModuleLocalBlockOffset = Stream.GetCurrentBitNo(); + assert(ModuleLocalBlockOffset > VisibleBlockOffset); + // Write the lookup table + RecordData::value_type ModuleLocalRecord[] = { + DECL_CONTEXT_MODULE_LOCAL_VISIBLE}; + Stream.EmitRecordWithBlob(DeclModuleLocalVisibleLookupAbbrev, + ModuleLocalRecord, ModuleLocalLookupTable); + ++NumModuleLocalDeclContexts; + } - ModuleLocalBlockOffset = Stream.GetCurrentBitNo(); - assert(ModuleLocalBlockOffset > VisibleBlockOffset); - // Write the lookup table - RecordData::value_type ModuleLocalRecord[] = { - DECL_CONTEXT_MODULE_LOCAL_VISIBLE}; - Stream.EmitRecordWithBlob(DeclModuleLocalVisibleLookupAbbrev, - ModuleLocalRecord, ModuleLocalLookupTable); - ++NumModuleLocalDeclContexts; + if (!TULookupTable.empty()) { + TULocalBlockOffset = Stream.GetCurrentBitNo(); + // Write the lookup table + RecordData::value_type TULocalDeclsRecord[] = { + DECL_CONTEXT_TU_LOCAL_VISIBLE}; + Stream.EmitRecordWithBlob(DeclTULocalLookupAbbrev, TULocalDeclsRecord, + TULookupTable); + ++NumTULocalDeclContexts; + } } /// Write an UPDATE_VISIBLE block for the given context. @@ -4824,7 +4904,9 @@ void ASTWriter::WriteDeclContextVisibleUpdate(ASTContext &Context, // Create the on-disk hash table in a buffer. SmallString<4096> LookupTable; SmallString<4096> ModuleLocalLookupTable; - GenerateNameLookupTable(Context, DC, LookupTable, ModuleLocalLookupTable); + SmallString<4096> TULookupTable; + GenerateNameLookupTable(Context, DC, LookupTable, ModuleLocalLookupTable, + TULookupTable); // If we're updating a namespace, select a key declaration as the key for the // update record; those are the only ones that will be checked on reload. @@ -4836,14 +4918,20 @@ void ASTWriter::WriteDeclContextVisibleUpdate(ASTContext &Context, getDeclID(cast(DC)).getRawValue()}; Stream.EmitRecordWithBlob(UpdateVisibleAbbrev, Record, LookupTable); - if (ModuleLocalLookupTable.empty()) - return; + if (!ModuleLocalLookupTable.empty()) { + // Write the module local lookup table + RecordData::value_type ModuleLocalRecord[] = { + UPDATE_MODULE_LOCAL_VISIBLE, getDeclID(cast(DC)).getRawValue()}; + Stream.EmitRecordWithBlob(ModuleLocalUpdateVisibleAbbrev, ModuleLocalRecord, + ModuleLocalLookupTable); + } - // Write the module local lookup table - RecordData::value_type ModuleLocalRecord[] = { - UPDATE_MODULE_LOCAL_VISIBLE, getDeclID(cast(DC)).getRawValue()}; - Stream.EmitRecordWithBlob(ModuleLocalUpdateVisibleAbbrev, ModuleLocalRecord, - ModuleLocalLookupTable); + if (!TULookupTable.empty()) { + RecordData::value_type GMFRecord[] = { + UPDATE_TU_LOCAL_VISIBLE, getDeclID(cast(DC)).getRawValue()}; + Stream.EmitRecordWithBlob(TULocalUpdateVisibleAbbrev, GMFRecord, + TULookupTable); + } } /// Write an FP_PRAGMA_OPTIONS block for the given FPOptions. 
@@ -6031,9 +6119,12 @@ ASTFileSignature ASTWriter::WriteASTCore(Sema *SemaPtr, StringRef isysroot, } // Some simple statistics - RecordData::value_type Record[] = { - NumStatements, NumMacros, NumLexicalDeclContexts, NumVisibleDeclContexts, - NumModuleLocalDeclContexts}; + RecordData::value_type Record[] = {NumStatements, + NumMacros, + NumLexicalDeclContexts, + NumVisibleDeclContexts, + NumModuleLocalDeclContexts, + NumTULocalDeclContexts}; Stream.EmitRecord(STATISTICS, Record); Stream.ExitBlock(); Stream.FlushToWord(); @@ -6112,7 +6203,9 @@ void ASTWriter::WriteDeclAndTypes(ASTContext &Context) { uint64_t LexicalOffset = WriteDeclContextLexicalBlock(Context, NS); uint64_t VisibleOffset = 0; uint64_t ModuleLocalOffset = 0; - WriteDeclContextVisibleBlock(Context, NS, VisibleOffset, ModuleLocalOffset); + uint64_t TULocalOffset = 0; + WriteDeclContextVisibleBlock(Context, NS, VisibleOffset, ModuleLocalOffset, + TULocalOffset); // Write the offset relative to current block. if (LexicalOffset) @@ -6124,10 +6217,14 @@ void ASTWriter::WriteDeclAndTypes(ASTContext &Context) { if (ModuleLocalOffset) ModuleLocalOffset -= DeclTypesBlockStartOffset; + if (TULocalOffset) + TULocalOffset -= DeclTypesBlockStartOffset; + AddDeclRef(NS, DelayedNamespaceRecord); DelayedNamespaceRecord.push_back(LexicalOffset); DelayedNamespaceRecord.push_back(VisibleOffset); DelayedNamespaceRecord.push_back(ModuleLocalOffset); + DelayedNamespaceRecord.push_back(TULocalOffset); } // The process of writing lexical and visible block for delayed namespace @@ -6213,6 +6310,12 @@ void ASTWriter::WriteDeclAndTypes(ASTContext &Context) { Abv->Add(llvm::BitCodeAbbrevOp(llvm::BitCodeAbbrevOp::Blob)); ModuleLocalUpdateVisibleAbbrev = Stream.EmitAbbrev(std::move(Abv)); + Abv = std::make_shared(); + Abv->Add(llvm::BitCodeAbbrevOp(UPDATE_TU_LOCAL_VISIBLE)); + Abv->Add(llvm::BitCodeAbbrevOp(llvm::BitCodeAbbrevOp::VBR, 6)); + Abv->Add(llvm::BitCodeAbbrevOp(llvm::BitCodeAbbrevOp::Blob)); + TULocalUpdateVisibleAbbrev = Stream.EmitAbbrev(std::move(Abv)); + // And a visible updates block for the translation unit. 
WriteDeclContextVisibleUpdate(Context, TU); diff --git a/clang/lib/Serialization/ASTWriterDecl.cpp b/clang/lib/Serialization/ASTWriterDecl.cpp index 7a494cfe1ac64..30b28057f4c10 100644 --- a/clang/lib/Serialization/ASTWriterDecl.cpp +++ b/clang/lib/Serialization/ASTWriterDecl.cpp @@ -2069,6 +2069,7 @@ void ASTDeclWriter::VisitDeclContext(DeclContext *DC) { uint64_t LexicalOffset = 0; uint64_t VisibleOffset = 0; uint64_t ModuleLocalOffset = 0; + uint64_t TULocalOffset = 0; if (Writer.isGeneratingReducedBMI() && isa(DC) && cast(DC)->isFromExplicitGlobalModule()) { @@ -2080,12 +2081,14 @@ void ASTDeclWriter::VisitDeclContext(DeclContext *DC) { LexicalOffset = Writer.WriteDeclContextLexicalBlock(Record.getASTContext(), DC); Writer.WriteDeclContextVisibleBlock(Record.getASTContext(), DC, - VisibleOffset, ModuleLocalOffset); + VisibleOffset, ModuleLocalOffset, + TULocalOffset); } Record.AddOffset(LexicalOffset); Record.AddOffset(VisibleOffset); Record.AddOffset(ModuleLocalOffset); + Record.AddOffset(TULocalOffset); } const Decl *ASTWriter::getFirstLocalDecl(const Decl *D) { @@ -2441,6 +2444,7 @@ void ASTWriter::WriteDeclAbbrevs() { Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // LexicalOffset Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // VisibleOffset Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // ModuleLocalOffset + Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // TULocalOffset DeclEnumAbbrev = Stream.EmitAbbrev(std::move(Abv)); // Abbreviation for DECL_RECORD @@ -2494,6 +2498,7 @@ void ASTWriter::WriteDeclAbbrevs() { Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // LexicalOffset Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // VisibleOffset Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // ModuleLocalOffset + Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // TULocalOffset DeclRecordAbbrev = Stream.EmitAbbrev(std::move(Abv)); // Abbreviation for DECL_PARM_VAR @@ -2836,6 +2841,11 @@ void ASTWriter::WriteDeclAbbrevs() { Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Blob)); DeclModuleLocalVisibleLookupAbbrev = Stream.EmitAbbrev(std::move(Abv)); + Abv = std::make_shared(); + Abv->Add(BitCodeAbbrevOp(serialization::DECL_CONTEXT_TU_LOCAL_VISIBLE)); + Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Blob)); + DeclTULocalLookupAbbrev = Stream.EmitAbbrev(std::move(Abv)); + Abv = std::make_shared(); Abv->Add(BitCodeAbbrevOp(serialization::DECL_SPECIALIZATIONS)); Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Blob)); diff --git a/clang/test/CXX/basic/basic.lookup/basic.lookup.argdep/p5-ex2.cpp b/clang/test/CXX/basic/basic.lookup/basic.lookup.argdep/p5-ex2.cpp index a27946bd90a46..c200abafc0af8 100644 --- a/clang/test/CXX/basic/basic.lookup/basic.lookup.argdep/p5-ex2.cpp +++ b/clang/test/CXX/basic/basic.lookup/basic.lookup.argdep/p5-ex2.cpp @@ -61,6 +61,6 @@ void test() { // error: S::f is visible in instantiation context, but R::g has internal // linkage and cannot be used outside N.cpp - apply(x, S::Z()); // expected-error@N.cpp:10 {{no matching function for call to 'g'}} - // expected-note@-1 {{in instantiation of function template specialization 'apply' requested here}} + apply(x, S::Z()); // expected-error@N.cpp:10 {{use of undeclared identifier 'g'}} + // expected-note@-1 {{requested here}} } diff --git a/clang/test/CXX/basic/basic.scope/basic.scope.namespace/p2.cpp b/clang/test/CXX/basic/basic.scope/basic.scope.namespace/p2.cpp index 54ec6aa61ec37..d70eb7de22c6a 100644 --- a/clang/test/CXX/basic/basic.scope/basic.scope.namespace/p2.cpp +++ 
b/clang/test/CXX/basic/basic.scope/basic.scope.namespace/p2.cpp @@ -66,11 +66,7 @@ void test_late() { // expected-note@p2.cpp:18 {{'exported' declared here}} #endif - internal = 1; -#ifndef IMPLEMENTATION - // expected-error@-2 {{declaration of 'internal' must be imported from module 'A' before it is required}} - // expected-note@p2.cpp:20 {{declaration here is not visible}} -#endif + internal = 1; // expected-error {{use of undeclared identifier 'internal'}} not_exported_private = 1; #ifndef IMPLEMENTATION @@ -78,11 +74,7 @@ void test_late() { // expected-error@-3 {{undeclared identifier}} #endif - internal_private = 1; -#ifndef IMPLEMENTATION - // FIXME: should not be visible here - // expected-error@-3 {{undeclared identifier}} -#endif + internal_private = 1; // expected-error {{use of undeclared identifier 'internal_private'}} } #endif diff --git a/clang/test/CXX/module/basic/basic.def.odr/p4.cppm b/clang/test/CXX/module/basic/basic.def.odr/p4.cppm index 487dbdef283ee..7e88cbe78b4e3 100644 --- a/clang/test/CXX/module/basic/basic.def.odr/p4.cppm +++ b/clang/test/CXX/module/basic/basic.def.odr/p4.cppm @@ -128,7 +128,6 @@ void f(a::b, a::c) {} // // CHECK-DAG: @_ZW6Module25extern_var_module_linkage = external {{(dso_local )?}}global // CHECK-DAG: @_ZW6Module25inline_var_module_linkage = linkonce_odr {{(dso_local )?}}global -// CHECK-DAG: @_ZL25static_var_module_linkage = internal {{(dso_local )?}}global i32 0, // CHECK-DAG: @_ZW6Module24const_var_module_linkage = available_externally {{(dso_local )?}}constant i32 3, module Module; @@ -152,10 +151,6 @@ void use() { (void)&extern_var_module_linkage; (void)&inline_var_module_linkage; - // FIXME: Issue #61427 Internal-linkage declarations in the interface TU - // should not be not visible here. - (void)&static_var_module_linkage; // FIXME: Should not be visible here. - (void)&const_var_module_linkage; // FIXME: will be visible after P2788R0 } diff --git a/clang/test/CXX/module/basic/basic.link/p2.cppm b/clang/test/CXX/module/basic/basic.link/p2.cppm index 5a497304201dc..d7d2b5992a235 100644 --- a/clang/test/CXX/module/basic/basic.link/p2.cppm +++ b/clang/test/CXX/module/basic/basic.link/p2.cppm @@ -45,16 +45,14 @@ module M; void use_from_module_impl() { external_linkage_fn(); module_linkage_fn(); - internal_linkage_fn(); // expected-error {{no matching function for call to 'internal_linkage_fn'}} + internal_linkage_fn(); // expected-error {{use of undeclared identifier 'internal_linkage_fn'}} // expected-note@* {{}} (void)external_linkage_class{}; (void)module_linkage_class{}; (void)external_linkage_var; (void)module_linkage_var; - // FIXME: Issue #61427 Internal-linkage declarations in the interface TU - // should not be not visible here. 
- (void)internal_linkage_class{}; - (void)internal_linkage_var; + (void)internal_linkage_class{}; // expected-error {{use of undeclared identifier 'internal_linkage_class'}} //expected-error{{}} + (void)internal_linkage_var; // expected-error {{use of undeclared identifier 'internal_linkage_var'}} } //--- user.cpp @@ -63,11 +61,10 @@ import M; void use_from_module_impl() { external_linkage_fn(); module_linkage_fn(); // expected-error {{use of undeclared identifier 'module_linkage_fn'}} - internal_linkage_fn(); // expected-error {{declaration of 'internal_linkage_fn' must be imported}} + internal_linkage_fn(); // expected-error {{use of undeclared identifier 'internal_linkage_fn'}} (void)external_linkage_class{}; - (void)module_linkage_class{}; // expected-error {{undeclared identifier}} expected-error 0+{{}} + (void)module_linkage_class{}; // expected-error {{undeclared identifier}} expected-error 0+{{}} // expected-note@* {{}} (void)internal_linkage_class{}; // expected-error {{undeclared identifier}} expected-error 0+{{}} - // expected-note@M.cppm:10 {{declaration here is not visible}} (void)external_linkage_var; (void)module_linkage_var; // expected-error {{undeclared identifier}} (void)internal_linkage_var; // expected-error {{undeclared identifier}} From 41f430a48db992477534b65b288b47d487c4797d Mon Sep 17 00:00:00 2001 From: Wesley Wiser Date: Fri, 17 Jan 2025 07:09:00 -0600 Subject: [PATCH 43/45] [X86] Don't fold very large offsets into addr displacements during ISel (#121678) Doing so can cause the resulting displacement after frame layout to become inexpressible (or cause over/underflow currently during frame layout). Fixes the error reported in https://github.com/llvm/llvm-project/pull/101840#issuecomment-2306975944. --- llvm/lib/Target/X86/X86ISelDAGToDAG.cpp | 18 +++++---- llvm/test/CodeGen/X86/dag-large-offset.ll | 47 +++++++++++++++++++++++ llvm/test/CodeGen/X86/xor-lea.ll | 3 +- 3 files changed, 60 insertions(+), 8 deletions(-) create mode 100644 llvm/test/CodeGen/X86/dag-large-offset.ll diff --git a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp index 9b340a778b36a..84bcdae520885 100644 --- a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp +++ b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp @@ -1800,10 +1800,10 @@ void X86DAGToDAGISel::emitFunctionEntryCode() { emitSpecialCodeForMain(); } -static bool isDispSafeForFrameIndex(int64_t Val) { - // On 64-bit platforms, we can run into an issue where a frame index +static bool isDispSafeForFrameIndexOrRegBase(int64_t Val) { + // We can run into an issue where a frame index or a register base // includes a displacement that, when added to the explicit displacement, - // will overflow the displacement field. Assuming that the frame index + // will overflow the displacement field. Assuming that the // displacement fits into a 31-bit integer (which is only slightly more // aggressive than the current fundamental assumption that it fits into // a 32-bit integer), a 31-bit disp should always be safe. @@ -1831,7 +1831,7 @@ bool X86DAGToDAGISel::foldOffsetIntoAddress(uint64_t Offset, // In addition to the checks required for a register base, check that // we do not try to use an unsafe Disp with a frame index. if (AM.BaseType == X86ISelAddressMode::FrameIndexBase && - !isDispSafeForFrameIndex(Val)) + !isDispSafeForFrameIndexOrRegBase(Val)) return true; // In ILP32 (x32) mode, pointers are 32 bits and need to be zero-extended to // 64 bits. 
Instructions with 32-bit register addresses perform this zero @@ -1849,10 +1849,14 @@ bool X86DAGToDAGISel::foldOffsetIntoAddress(uint64_t Offset, // to get an address size override to be emitted. However, this // pseudo-register is not part of any register class and therefore causes // MIR verification to fail. - if (Subtarget->isTarget64BitILP32() && !isUInt<31>(Val) && + if (Subtarget->isTarget64BitILP32() && + !isDispSafeForFrameIndexOrRegBase((uint32_t)Val) && !AM.hasBaseOrIndexReg()) return true; - } + } else if (AM.hasBaseOrIndexReg() && !isDispSafeForFrameIndexOrRegBase(Val)) + // For 32-bit X86, make sure the displacement still isn't close to the + // expressible limit. + return true; AM.Disp = Val; return false; } @@ -2553,7 +2557,7 @@ bool X86DAGToDAGISel::matchAddressRecursively(SDValue N, X86ISelAddressMode &AM, case ISD::FrameIndex: if (AM.BaseType == X86ISelAddressMode::RegBase && AM.Base_Reg.getNode() == nullptr && - (!Subtarget->is64Bit() || isDispSafeForFrameIndex(AM.Disp))) { + (!Subtarget->is64Bit() || isDispSafeForFrameIndexOrRegBase(AM.Disp))) { AM.BaseType = X86ISelAddressMode::FrameIndexBase; AM.Base_FrameIndex = cast(N)->getIndex(); return false; diff --git a/llvm/test/CodeGen/X86/dag-large-offset.ll b/llvm/test/CodeGen/X86/dag-large-offset.ll new file mode 100644 index 0000000000000..2774a93993153 --- /dev/null +++ b/llvm/test/CodeGen/X86/dag-large-offset.ll @@ -0,0 +1,47 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=i386 --frame-pointer=all | FileCheck %s + +; ISel will try to fold pointer arithmetic into the address displacement. However, we don't +; want to do that if the offset is very close to the expressible limit because the final frame +; layout may push it over/under the limit. 
+
+define i32 @foo(i1 %b) #0 {
+; CHECK-LABEL: foo:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: pushl %ebp
+; CHECK-NEXT: .cfi_def_cfa_offset 8
+; CHECK-NEXT: .cfi_offset %ebp, -8
+; CHECK-NEXT: movl %esp, %ebp
+; CHECK-NEXT: .cfi_def_cfa_register %ebp
+; CHECK-NEXT: subl $8, %esp
+; CHECK-NEXT: movl __stack_chk_guard, %eax
+; CHECK-NEXT: movl %eax, -4(%ebp)
+; CHECK-NEXT: testb $1, 8(%ebp)
+; CHECK-NEXT: jne .LBB0_1
+; CHECK-NEXT: # %bb.2: # %entry
+; CHECK-NEXT: xorl %eax, %eax
+; CHECK-NEXT: jmp .LBB0_3
+; CHECK-NEXT: .LBB0_1:
+; CHECK-NEXT: movl $-2147483647, %eax # imm = 0x80000001
+; CHECK-NEXT: leal -5(%ebp,%eax), %eax
+; CHECK-NEXT: .LBB0_3: # %entry
+; CHECK-NEXT: movl __stack_chk_guard, %ecx
+; CHECK-NEXT: cmpl -4(%ebp), %ecx
+; CHECK-NEXT: jne .LBB0_5
+; CHECK-NEXT: # %bb.4: # %entry
+; CHECK-NEXT: addl $8, %esp
+; CHECK-NEXT: popl %ebp
+; CHECK-NEXT: .cfi_def_cfa %esp, 4
+; CHECK-NEXT: retl
+; CHECK-NEXT: .LBB0_5: # %entry
+; CHECK-NEXT: .cfi_def_cfa %ebp, 8
+; CHECK-NEXT: calll __stack_chk_fail
+entry:
+  %a = alloca i8, align 1
+  %0 = ptrtoint ptr %a to i32
+  %sub = add i32 %0, -2147483647
+  %retval.0 = select i1 %b, i32 %sub, i32 0
+  ret i32 %retval.0
+}
+
+attributes #0 = { sspreq }
diff --git a/llvm/test/CodeGen/X86/xor-lea.ll b/llvm/test/CodeGen/X86/xor-lea.ll
index 10e9525a2706a..d50752e48d293 100644
--- a/llvm/test/CodeGen/X86/xor-lea.ll
+++ b/llvm/test/CodeGen/X86/xor-lea.ll
@@ -327,7 +327,8 @@ define i32 @xor_shl_sminval_i32(i32 %x) {
 ; X86-LABEL: xor_shl_sminval_i32:
 ; X86: # %bb.0:
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: leal -2147483648(,%eax,8), %eax
+; X86-NEXT: movl $-2147483648, %ecx # imm = 0x80000000
+; X86-NEXT: leal (%ecx,%eax,8), %eax
 ; X86-NEXT: retl
 ;
 ; X64-LABEL: xor_shl_sminval_i32:

From 5153a90453e692b834e38eec247a0c88a0678bfa Mon Sep 17 00:00:00 2001
From: Michael Buch
Date: Fri, 17 Jan 2025 13:09:52 +0000
Subject: [PATCH 44/45] [lldb][DWARF] Change GetAttributes to always visit current DIE before recursing (#123261)

`GetAttributes` returns all attributes on a given DIE, including any
attributes that the DIE references via `DW_AT_abstract_origin` and
`DW_AT_specification`.

However, if an attribute exists on both the referring DIE and the
referenced DIE, the first one encountered will be the one that takes
precedence when querying the returned `DWARFAttributes`. But there was
no guarantee in which order those attributes get visited. That means
there's no convenient way of ensuring that an attribute of a definition
doesn't get shadowed by one found on the declaration. One use-case where
we don't want this to happen is for `DW_AT_object_pointer` (which can
exist on both definitions and declarations, see
https://github.com/llvm/llvm-project/pull/123089).

This patch makes sure we visit the current DIE's attributes before
following DIE references. I tried keeping as much of the original
`GetAttributes` unchanged and just added an outer `GetAttributes` that
keeps track of the DIEs we need to visit next.

There's precedent for this iteration order in
`llvm::DWARFDie::findRecursively` and also
`lldb_private::ElaboratingDIEIterator`. We could use the latter to
implement `GetAttributes`, though it also follows `DW_AT_signature` so I
decided to leave it for follow-up.
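To make the intended ordering concrete, here is a minimal standalone C++
sketch of the scheme described above. It is not the LLDB implementation;
the `Die` struct, the `getAttributes` helper, and the (name, value)
attribute representation are made-up stand-ins. The point is the
iteration order: append the current DIE's own attributes first, only then
queue the DIEs it references, and track already-seen DIEs so reference
cycles terminate.

  #include <cstdint>
  #include <set>
  #include <string>
  #include <utility>
  #include <vector>

  // Made-up stand-in for a DIE: a list of (attribute, value) pairs plus
  // the DIEs it refers to via DW_AT_specification/DW_AT_abstract_origin.
  struct Die {
    std::vector<std::pair<std::string, uint64_t>> Attrs;
    std::vector<const Die *> Refs;
  };

  // Earlier entries in the result take precedence, so visiting the
  // referring DIE before the DIEs it references lets a definition shadow
  // its declaration.
  std::vector<std::pair<std::string, uint64_t>> getAttributes(const Die &Start) {
    std::vector<std::pair<std::string, uint64_t>> Result;
    std::vector<const Die *> Worklist{&Start};
    std::set<const Die *> Seen{&Start}; // guards against reference cycles
    while (!Worklist.empty()) {
      const Die *Cur = Worklist.back();
      Worklist.pop_back();
      // Append the current DIE's own attributes first ...
      Result.insert(Result.end(), Cur->Attrs.begin(), Cur->Attrs.end());
      // ... and only then queue the DIEs it refers to.
      for (const Die *Ref : Cur->Refs)
        if (Seen.insert(Ref).second)
          Worklist.push_back(Ref);
    }
    return Result;
  }

  int main() {
    Die Decl{{{"DW_AT_name", 1}, {"DW_AT_low_pc", 0x1000}}, {}};
    Die Def{{{"DW_AT_low_pc", 0x2000}}, {&Decl}};
    // The first DW_AT_low_pc in the result is the definition's (0x2000),
    // not the declaration's, because the definition was visited first.
    auto Attrs = getAttributes(Def);
    return Attrs.front().second == 0x2000 ? 0 : 1;
  }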
--- .../SymbolFile/DWARF/DWARFDebugInfoEntry.cpp | 83 ++- .../SymbolFile/DWARF/DWARFDebugInfoEntry.h | 32 +- .../SymbolFile/DWARF/SymbolFileDWARF.cpp | 5 +- .../SymbolFile/DWARF/DWARFDIETest.cpp | 640 ++++++++++++++++++ 4 files changed, 727 insertions(+), 33 deletions(-) diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFDebugInfoEntry.cpp b/lldb/source/Plugins/SymbolFile/DWARF/DWARFDebugInfoEntry.cpp index 6d073411de876..c2edc52aa964f 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFDebugInfoEntry.cpp +++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFDebugInfoEntry.cpp @@ -281,22 +281,34 @@ bool DWARFDebugInfoEntry::GetDIENamesAndRanges( return !ranges.empty(); } -// Get all attribute values for a given DIE, including following any -// specification or abstract origin attributes and including those in the -// results. Any duplicate attributes will have the first instance take -// precedence (this can happen for declaration attributes). -void DWARFDebugInfoEntry::GetAttributes(DWARFUnit *cu, - DWARFAttributes &attributes, - Recurse recurse, - uint32_t curr_depth) const { - const auto *abbrevDecl = GetAbbreviationDeclarationPtr(cu); - if (!abbrevDecl) { - attributes.Clear(); - return; - } +/// Helper for the public \ref DWARFDebugInfoEntry::GetAttributes API. +/// Adds all attributes of the DIE at the top of the \c worklist to the +/// \c attributes list. Specifcations and abstract origins are added +/// to the \c worklist if the referenced DIE has not been seen before. +static bool GetAttributes(llvm::SmallVector &worklist, + llvm::SmallSet &seen, + DWARFAttributes &attributes) { + assert(!worklist.empty() && "Need at least one DIE to visit."); + assert(seen.size() >= 1 && + "Need to have seen at least the currently visited entry."); + + DWARFDIE current = worklist.pop_back_val(); + + const auto *cu = current.GetCU(); + assert(cu); + + const auto *entry = current.GetDIE(); + assert(entry); + + const auto *abbrevDecl = + entry->GetAbbreviationDeclarationPtr(current.GetCU()); + if (!abbrevDecl) + return false; const DWARFDataExtractor &data = cu->GetData(); - lldb::offset_t offset = GetFirstAttributeOffset(); + lldb::offset_t offset = current.GetDIE()->GetFirstAttributeOffset(); + + const bool is_first_die = seen.size() == 1; for (const auto &attribute : abbrevDecl->attributes()) { DWARFFormValue form_value(cu); @@ -309,10 +321,10 @@ void DWARFDebugInfoEntry::GetAttributes(DWARFUnit *cu, switch (attr) { case DW_AT_sibling: case DW_AT_declaration: - if (curr_depth > 0) { + if (!is_first_die) { // This attribute doesn't make sense when combined with the DIE that // references this DIE. We know a DIE is referencing this DIE because - // curr_depth is not zero + // we've visited more than one DIE already. 
break; } [[fallthrough]]; @@ -321,13 +333,12 @@ void DWARFDebugInfoEntry::GetAttributes(DWARFUnit *cu, break; } - if (recurse == Recurse::yes && - ((attr == DW_AT_specification) || (attr == DW_AT_abstract_origin))) { + if (attr == DW_AT_specification || attr == DW_AT_abstract_origin) { if (form_value.ExtractValue(data, &offset)) { - DWARFDIE spec_die = form_value.Reference(); - if (spec_die) - spec_die.GetDIE()->GetAttributes(spec_die.GetCU(), attributes, - recurse, curr_depth + 1); + if (DWARFDIE spec_die = form_value.Reference()) { + if (seen.insert(spec_die.GetDIE()).second) + worklist.push_back(spec_die); + } } } else { const dw_form_t form = form_value.Form(); @@ -339,6 +350,34 @@ void DWARFDebugInfoEntry::GetAttributes(DWARFUnit *cu, DWARFFormValue::SkipValue(form, data, &offset, cu); } } + + return true; +} + +DWARFAttributes DWARFDebugInfoEntry::GetAttributes(const DWARFUnit *cu, + Recurse recurse) const { + // FIXME: use ElaboratingDIEIterator to follow specifications/abstract origins + // instead of maintaining our own worklist/seen list. + + DWARFAttributes attributes; + + llvm::SmallVector worklist; + worklist.emplace_back(cu, this); + + // Keep track if DIEs already seen to prevent infinite recursion. + // Value of '3' was picked for the same reason that + // DWARFDie::findRecursively does. + llvm::SmallSet seen; + seen.insert(this); + + do { + if (!::GetAttributes(worklist, seen, attributes)) { + attributes.Clear(); + break; + } + } while (!worklist.empty() && recurse == Recurse::yes); + + return attributes; } // GetAttributeValue diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFDebugInfoEntry.h b/lldb/source/Plugins/SymbolFile/DWARF/DWARFDebugInfoEntry.h index de6bbf1d52789..72aeb2743b1e2 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFDebugInfoEntry.h +++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFDebugInfoEntry.h @@ -52,12 +52,28 @@ class DWARFDebugInfoEntry { lldb::offset_t *offset_ptr); using Recurse = DWARFBaseDIE::Recurse; - DWARFAttributes GetAttributes(DWARFUnit *cu, - Recurse recurse = Recurse::yes) const { - DWARFAttributes attrs; - GetAttributes(cu, attrs, recurse, 0 /* curr_depth */); - return attrs; - } + + /// Get all attribute values for a given DIE, optionally following any + /// specifications and abstract origins and including their attributes + /// in the result too. + /// + /// When following specifications/abstract origins, the attributes + /// on the referring DIE are guaranteed to be visited before the attributes of + /// the referenced DIE. + /// + /// \param[in] cu DWARFUnit that this entry belongs to. + /// + /// \param[in] recurse If set to \c Recurse::yes, will include attributes + /// on DIEs referenced via \c DW_AT_specification and \c DW_AT_abstract_origin + /// (including across multiple levels of indirection). + /// + /// \returns DWARFAttributes that include all attributes found on this DIE + /// (and possibly referenced DIEs). Attributes may appear multiple times + /// (e.g., if a declaration and definition both specify the same attribute). + /// On failure, the returned DWARFAttributes will be empty. 
+ /// + DWARFAttributes GetAttributes(const DWARFUnit *cu, + Recurse recurse = Recurse::yes) const; dw_offset_t GetAttributeValue(const DWARFUnit *cu, const dw_attr_t attr, DWARFFormValue &formValue, @@ -178,10 +194,6 @@ class DWARFDebugInfoEntry { /// A copy of the DW_TAG value so we don't have to go through the compile /// unit abbrev table dw_tag_t m_tag = llvm::dwarf::DW_TAG_null; - -private: - void GetAttributes(DWARFUnit *cu, DWARFAttributes &attrs, Recurse recurse, - uint32_t curr_depth) const; }; } // namespace dwarf } // namespace lldb_private::plugin diff --git a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp index 2f451d173c4dd..ad5005b660c64 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp +++ b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp @@ -3414,7 +3414,10 @@ VariableSP SymbolFileDWARF::ParseVariableDIE(const SymbolContext &sc, mangled = form_value.AsCString(); break; case DW_AT_type: - type_die_form = form_value; + // DW_AT_type on declaration may be less accurate than + // that of definition, so don't overwrite it. + if (!type_die_form.IsValid()) + type_die_form = form_value; break; case DW_AT_external: is_external = form_value.Boolean(); diff --git a/lldb/unittests/SymbolFile/DWARF/DWARFDIETest.cpp b/lldb/unittests/SymbolFile/DWARF/DWARFDIETest.cpp index 1e4c8f3ba0778..3f61d1607073c 100644 --- a/lldb/unittests/SymbolFile/DWARF/DWARFDIETest.cpp +++ b/lldb/unittests/SymbolFile/DWARF/DWARFDIETest.cpp @@ -394,3 +394,643 @@ TEST(DWARFDIETest, GetContextInFunction) { EXPECT_THAT(foo_struct_die.GetTypeLookupContext(), testing::ElementsAre(make_struct("struct_t"))); } + +struct GetAttributesTestFixture : public testing::TestWithParam {}; + +TEST_P(GetAttributesTestFixture, TestGetAttributes_IterationOrder) { + // Tests that we accumulate all current DIE's attributes first + // before checking the attributes of the specification. 
+ + const char *yamldata = R"( +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_EXEC + Machine: EM_AARCH64 +DWARF: + debug_str: + - func + debug_abbrev: + - ID: 0 + Table: + - Code: 0x1 + Tag: DW_TAG_compile_unit + Children: DW_CHILDREN_yes + Attributes: + - Attribute: DW_AT_language + Form: DW_FORM_data2 + - Code: 0x2 + Tag: DW_TAG_subprogram + Children: DW_CHILDREN_no + Attributes: + - Attribute: DW_AT_high_pc + Form: DW_FORM_data4 + - Attribute: DW_AT_name + Form: DW_FORM_strp + - Attribute: DW_AT_declaration + Form: DW_FORM_flag_present + - Attribute: DW_AT_external + Form: DW_FORM_flag_present + - Attribute: DW_AT_low_pc + Form: DW_FORM_data4 + - Code: 0x3 + Tag: DW_TAG_subprogram + Children: DW_CHILDREN_no + Attributes: + - Attribute: DW_AT_high_pc + Form: DW_FORM_data4 + - Attribute: {0} + Form: DW_FORM_ref4 + - Attribute: DW_AT_low_pc + Form: DW_FORM_data4 + debug_info: + - Version: 5 + UnitType: DW_UT_compile + AddrSize: 8 + Entries: + +# DW_TAG_compile_unit +# DW_AT_language [DW_FORM_data2] (DW_LANG_C_plus_plus) + + - AbbrCode: 0x1 + Values: + - Value: 0x04 + +# DW_TAG_subprogram +# DW_AT_high_pc [DW_FORM_data4] +# DW_AT_name [DW_FORM_strp] ("func") +# DW_AT_low_pc [DW_FORM_data4] + - AbbrCode: 0x2 + Values: + - Value: 0xdeadbeef + - Value: 0x0 + - Value: 0x1 + - Value: 0x1 + - Value: 0xdeadbeef + +# DW_TAG_subprogram +# DW_AT_high_pc [DW_FORM_data4] +# DW_AT_specification [DW_FORM_ref4] ("func") +# DW_AT_low_pc [DW_FORM_data4] + - AbbrCode: 0x3 + Values: + - Value: 0xf00dcafe + - Value: 0xf + - Value: 0xf00dcafe + + - AbbrCode: 0x0 +... +)"; + YAMLModuleTester t(llvm::formatv(yamldata, GetParam()).str()); + + DWARFUnit *unit = t.GetDwarfUnit(); + ASSERT_NE(unit, nullptr); + const DWARFDebugInfoEntry *cu_entry = unit->DIE().GetDIE(); + ASSERT_EQ(cu_entry->Tag(), DW_TAG_compile_unit); + ASSERT_EQ(unit->GetDWARFLanguageType(), DW_LANG_C_plus_plus); + DWARFDIE cu_die(unit, cu_entry); + + auto declaration = cu_die.GetFirstChild(); + ASSERT_TRUE(declaration.IsValid()); + ASSERT_EQ(declaration.Tag(), DW_TAG_subprogram); + + auto definition = declaration.GetSibling(); + ASSERT_TRUE(definition.IsValid()); + ASSERT_EQ(definition.Tag(), DW_TAG_subprogram); + ASSERT_FALSE(definition.GetAttributeValueAsOptionalUnsigned(DW_AT_external)); + + auto attrs = definition.GetAttributes(DWARFDebugInfoEntry::Recurse::yes); + EXPECT_EQ(attrs.Size(), 7U); + + // Check that the attributes on the definition (that are also present + // on the declaration) take precedence. + for (auto attr : {DW_AT_low_pc, DW_AT_high_pc}) { + auto idx = attrs.FindAttributeIndex(attr); + EXPECT_NE(idx, UINT32_MAX); + + DWARFFormValue form_value; + auto success = attrs.ExtractFormValueAtIndex(idx, form_value); + EXPECT_TRUE(success); + + EXPECT_EQ(form_value.Unsigned(), 0xf00dcafe); + } +} + +TEST_P(GetAttributesTestFixture, TestGetAttributes_Cycle) { + // Tests that GetAttributes can deal with cycles in + // specifications/abstract origins. 
+ // + // Contrived example: + // + // func1 -> func3 + // ^ | + // | v + // +------func2 + + const char *yamldata = R"( +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_EXEC + Machine: EM_AARCH64 +DWARF: + debug_abbrev: + - ID: 0 + Table: + - Code: 0x1 + Tag: DW_TAG_compile_unit + Children: DW_CHILDREN_yes + Attributes: + - Attribute: DW_AT_language + Form: DW_FORM_data2 + - Code: 0x2 + Tag: DW_TAG_subprogram + Children: DW_CHILDREN_no + Attributes: + - Attribute: {0} + Form: DW_FORM_ref4 + debug_info: + - Version: 5 + UnitType: DW_UT_compile + AddrSize: 8 + Entries: + + - AbbrCode: 0x1 + Values: + - Value: 0x04 + + - AbbrCode: 0x2 + Values: + - Value: 0x19 + + - AbbrCode: 0x2 + Values: + - Value: 0xf + + - AbbrCode: 0x2 + Values: + - Value: 0x14 + + - AbbrCode: 0x0 +... +)"; + YAMLModuleTester t(llvm::formatv(yamldata, GetParam()).str()); + + DWARFUnit *unit = t.GetDwarfUnit(); + ASSERT_NE(unit, nullptr); + const DWARFDebugInfoEntry *cu_entry = unit->DIE().GetDIE(); + ASSERT_EQ(cu_entry->Tag(), DW_TAG_compile_unit); + ASSERT_EQ(unit->GetDWARFLanguageType(), DW_LANG_C_plus_plus); + DWARFDIE cu_die(unit, cu_entry); + + auto func1 = cu_die.GetFirstChild(); + ASSERT_TRUE(func1.IsValid()); + ASSERT_EQ(func1.Tag(), DW_TAG_subprogram); + + auto func2 = func1.GetSibling(); + ASSERT_TRUE(func2.IsValid()); + ASSERT_EQ(func2.Tag(), DW_TAG_subprogram); + + auto func3 = func2.GetSibling(); + ASSERT_TRUE(func3.IsValid()); + ASSERT_EQ(func3.Tag(), DW_TAG_subprogram); + + auto attrs = func1.GetAttributes(DWARFDebugInfoEntry::Recurse::yes); + EXPECT_EQ(attrs.Size(), 3U); + + // Confirm that the specifications do form a cycle. + { + DWARFFormValue form_value; + auto success = attrs.ExtractFormValueAtIndex(0, form_value); + ASSERT_TRUE(success); + + EXPECT_EQ(form_value.Reference(), func3); + } + + { + DWARFFormValue form_value; + auto success = attrs.ExtractFormValueAtIndex(1, form_value); + ASSERT_TRUE(success); + + EXPECT_EQ(form_value.Reference(), func2); + } + + { + DWARFFormValue form_value; + auto success = attrs.ExtractFormValueAtIndex(2, form_value); + ASSERT_TRUE(success); + + EXPECT_EQ(form_value.Reference(), func1); + } +} + +TEST_P(GetAttributesTestFixture, + TestGetAttributes_SkipNonApplicableAttributes) { + // Tests that GetAttributes will omit attributes found through + // specifications/abstract origins which are not applicable. 
+ + const char *yamldata = R"( +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_EXEC + Machine: EM_AARCH64 +DWARF: + debug_str: + - func + debug_abbrev: + - ID: 0 + Table: + - Code: 0x1 + Tag: DW_TAG_compile_unit + Children: DW_CHILDREN_yes + Attributes: + - Attribute: DW_AT_language + Form: DW_FORM_data2 + - Code: 0x2 + Tag: DW_TAG_subprogram + Children: DW_CHILDREN_no + Attributes: + - Attribute: DW_AT_declaration + Form: DW_FORM_flag_present + - Attribute: DW_AT_name + Form: DW_FORM_strp + - Attribute: DW_AT_sibling + Form: DW_FORM_ref4 + - Code: 0x3 + Tag: DW_TAG_subprogram + Children: DW_CHILDREN_no + Attributes: + - Attribute: DW_AT_declaration + Form: DW_FORM_flag_present + - Attribute: {0} + Form: DW_FORM_ref4 + - Attribute: DW_AT_sibling + Form: DW_FORM_ref4 + debug_info: + - Version: 5 + UnitType: DW_UT_compile + AddrSize: 8 + Entries: + +# DW_TAG_compile_unit +# DW_AT_language [DW_FORM_data2] (DW_LANG_C_plus_plus) + + - AbbrCode: 0x1 + Values: + - Value: 0x04 + +# DW_TAG_subprogram +# DW_AT_declaration +# DW_AT_name [DW_FORM_strp] ("func") +# DW_AT_sibling + - AbbrCode: 0x2 + Values: + - Value: 0x1 + - Value: 0x0 + - Value: 0x18 + +# DW_TAG_subprogram +# DW_AT_declaration +# DW_AT_specification [DW_FORM_ref4] ("func") +# DW_AT_sibling + - AbbrCode: 0x3 + Values: + - Value: 0x1 + - Value: 0xf + - Value: 0xdeadbeef + + - AbbrCode: 0x0 +... +)"; + YAMLModuleTester t(llvm::formatv(yamldata, GetParam()).str()); + + DWARFUnit *unit = t.GetDwarfUnit(); + ASSERT_NE(unit, nullptr); + const DWARFDebugInfoEntry *cu_entry = unit->DIE().GetDIE(); + ASSERT_EQ(cu_entry->Tag(), DW_TAG_compile_unit); + ASSERT_EQ(unit->GetDWARFLanguageType(), DW_LANG_C_plus_plus); + DWARFDIE cu_die(unit, cu_entry); + + auto declaration = cu_die.GetFirstChild(); + ASSERT_TRUE(declaration.IsValid()); + ASSERT_EQ(declaration.Tag(), DW_TAG_subprogram); + + auto definition = declaration.GetSibling(); + ASSERT_TRUE(definition.IsValid()); + ASSERT_EQ(definition.Tag(), DW_TAG_subprogram); + + auto attrs = definition.GetAttributes(DWARFDebugInfoEntry::Recurse::yes); + EXPECT_EQ(attrs.Size(), 4U); + EXPECT_NE(attrs.FindAttributeIndex(DW_AT_name), UINT32_MAX); + EXPECT_NE(attrs.FindAttributeIndex(GetParam()), UINT32_MAX); + + auto sibling_idx = attrs.FindAttributeIndex(DW_AT_sibling); + EXPECT_NE(sibling_idx, UINT32_MAX); + + DWARFFormValue form_value; + auto success = attrs.ExtractFormValueAtIndex(sibling_idx, form_value); + ASSERT_TRUE(success); + + EXPECT_EQ(form_value.Unsigned(), 0xdeadbeef); +} + +TEST_P(GetAttributesTestFixture, TestGetAttributes_NoRecurse) { + // Tests that GetAttributes will not recurse if Recurse::No is passed to it. 
+ + const char *yamldata = R"( +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_EXEC + Machine: EM_AARCH64 +DWARF: + debug_str: + - func + debug_abbrev: + - ID: 0 + Table: + - Code: 0x1 + Tag: DW_TAG_compile_unit + Children: DW_CHILDREN_yes + Attributes: + - Attribute: DW_AT_language + Form: DW_FORM_data2 + - Code: 0x2 + Tag: DW_TAG_subprogram + Children: DW_CHILDREN_no + Attributes: + - Attribute: DW_AT_name + Form: DW_FORM_strp + - Code: 0x3 + Tag: DW_TAG_subprogram + Children: DW_CHILDREN_no + Attributes: + - Attribute: DW_AT_low_pc + Form: DW_FORM_data4 + - Attribute: {0} + Form: DW_FORM_ref4 + debug_info: + - Version: 5 + UnitType: DW_UT_compile + AddrSize: 8 + Entries: + +# DW_TAG_compile_unit +# DW_AT_language [DW_FORM_data2] (DW_LANG_C_plus_plus) + + - AbbrCode: 0x1 + Values: + - Value: 0x04 + +# DW_TAG_subprogram +# DW_AT_name [DW_FORM_strp] ("func") + - AbbrCode: 0x2 + Values: + - Value: 0x0 + +# DW_TAG_subprogram +# DW_AT_low_pc [DW_FORM_data4] +# DW_AT_specification [DW_FORM_ref4] + - AbbrCode: 0x3 + Values: + - Value: 0xdeadbeef + - Value: 0xf + + - AbbrCode: 0x0 +... +)"; + YAMLModuleTester t(llvm::formatv(yamldata, GetParam()).str()); + + DWARFUnit *unit = t.GetDwarfUnit(); + ASSERT_NE(unit, nullptr); + const DWARFDebugInfoEntry *cu_entry = unit->DIE().GetDIE(); + ASSERT_EQ(cu_entry->Tag(), DW_TAG_compile_unit); + ASSERT_EQ(unit->GetDWARFLanguageType(), DW_LANG_C_plus_plus); + DWARFDIE cu_die(unit, cu_entry); + + auto declaration = cu_die.GetFirstChild(); + ASSERT_TRUE(declaration.IsValid()); + ASSERT_EQ(declaration.Tag(), DW_TAG_subprogram); + + auto definition = declaration.GetSibling(); + ASSERT_TRUE(definition.IsValid()); + ASSERT_EQ(definition.Tag(), DW_TAG_subprogram); + + auto attrs = definition.GetAttributes(DWARFDebugInfoEntry::Recurse::no); + EXPECT_EQ(attrs.Size(), 2U); + EXPECT_EQ(attrs.FindAttributeIndex(DW_AT_name), UINT32_MAX); + EXPECT_NE(attrs.FindAttributeIndex(GetParam()), UINT32_MAX); + EXPECT_NE(attrs.FindAttributeIndex(DW_AT_low_pc), UINT32_MAX); +} + +TEST_P(GetAttributesTestFixture, TestGetAttributes_InvalidSpec) { + // Test that GetAttributes doesn't try following invalid + // specifications (but still add it to the list of attributes). + + const char *yamldata = R"( +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_EXEC + Machine: EM_AARCH64 +DWARF: + debug_str: + - func + debug_abbrev: + - ID: 0 + Table: + - Code: 0x1 + Tag: DW_TAG_compile_unit + Children: DW_CHILDREN_yes + Attributes: + - Attribute: DW_AT_language + Form: DW_FORM_data2 + - Code: 0x2 + Tag: DW_TAG_subprogram + Children: DW_CHILDREN_no + Attributes: + - Attribute: DW_AT_name + Form: DW_FORM_strp + - Code: 0x3 + Tag: DW_TAG_subprogram + Children: DW_CHILDREN_no + Attributes: + - Attribute: {0} + Form: DW_FORM_ref4 + debug_info: + - Version: 5 + UnitType: DW_UT_compile + AddrSize: 8 + Entries: + +# DW_TAG_compile_unit +# DW_AT_language [DW_FORM_data2] (DW_LANG_C_plus_plus) + + - AbbrCode: 0x1 + Values: + - Value: 0x04 + +# DW_TAG_subprogram +# DW_AT_name [DW_FORM_strp] ("func") + - AbbrCode: 0x2 + Values: + - Value: 0x0 + +# DW_TAG_subprogram +# DW_AT_specification [DW_FORM_ref4] + - AbbrCode: 0x3 + Values: + - Value: 0xdeadbeef + + - AbbrCode: 0x0 +... 
+)"; + YAMLModuleTester t(llvm::formatv(yamldata, GetParam()).str()); + + DWARFUnit *unit = t.GetDwarfUnit(); + ASSERT_NE(unit, nullptr); + const DWARFDebugInfoEntry *cu_entry = unit->DIE().GetDIE(); + ASSERT_EQ(cu_entry->Tag(), DW_TAG_compile_unit); + ASSERT_EQ(unit->GetDWARFLanguageType(), DW_LANG_C_plus_plus); + DWARFDIE cu_die(unit, cu_entry); + + auto declaration = cu_die.GetFirstChild(); + ASSERT_TRUE(declaration.IsValid()); + ASSERT_EQ(declaration.Tag(), DW_TAG_subprogram); + + auto definition = declaration.GetSibling(); + ASSERT_TRUE(definition.IsValid()); + ASSERT_EQ(definition.Tag(), DW_TAG_subprogram); + + auto attrs = definition.GetAttributes(DWARFDebugInfoEntry::Recurse::yes); + EXPECT_EQ(attrs.Size(), 1U); + EXPECT_EQ(attrs.FindAttributeIndex(DW_AT_name), UINT32_MAX); + EXPECT_NE(attrs.FindAttributeIndex(GetParam()), UINT32_MAX); +} + +TEST(DWARFDIETest, TestGetAttributes_Worklist) { + // Test that GetAttributes will follow both the abstract origin + // and specification on a single DIE correctly (omitting non-applicable + // attributes in the process). + + // Contrived example where + // f1---> f2 --> f4 + // `-> f3 `-> f5 + // + const char *yamldata = R"( +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_EXEC + Machine: EM_AARCH64 +DWARF: + debug_str: + - foo + - bar + debug_abbrev: + - ID: 0 + Table: + - Code: 0x1 + Tag: DW_TAG_compile_unit + Children: DW_CHILDREN_yes + Attributes: + - Attribute: DW_AT_language + Form: DW_FORM_data2 + - Code: 0x2 + Tag: DW_TAG_subprogram + Children: DW_CHILDREN_no + Attributes: + - Attribute: DW_AT_specification + Form: DW_FORM_ref4 + - Attribute: DW_AT_abstract_origin + Form: DW_FORM_ref4 + - Code: 0x3 + Tag: DW_TAG_subprogram + Children: DW_CHILDREN_no + Attributes: + - Attribute: DW_AT_declaration + Form: DW_FORM_flag_present + - Attribute: DW_AT_artificial + Form: DW_FORM_flag_present + + debug_info: + - Version: 5 + UnitType: DW_UT_compile + AddrSize: 8 + Entries: + + - AbbrCode: 0x1 + Values: + - Value: 0x04 + +# DW_TAG_subprogram ("f1") +# DW_AT_specification [DW_FORM_ref4] ("f2") +# DW_AT_abstract_origin [DW_FORM_ref4] ("f3") + - AbbrCode: 0x2 + Values: + - Value: 0x18 + - Value: 0x21 + +# DW_TAG_subprogram ("f2") +# DW_AT_specification [DW_FORM_ref4] ("f4") +# DW_AT_abstract_origin [DW_FORM_ref4] ("f5") + - AbbrCode: 0x2 + Values: + - Value: 0x22 + - Value: 0x23 + +# DW_TAG_subprogram ("f3") +# DW_AT_declaration [DW_FORM_flag_present] +# DW_AT_artificial [DW_FORM_flag_present] + - AbbrCode: 0x3 + Values: + - Value: 0x1 + - Value: 0x1 + +# DW_TAG_subprogram ("f4") +# DW_AT_declaration [DW_FORM_flag_present] +# DW_AT_artificial [DW_FORM_flag_present] + - AbbrCode: 0x3 + Values: + - Value: 0x1 + - Value: 0x1 + +# DW_TAG_subprogram ("f5") +# DW_AT_declaration [DW_FORM_flag_present] +# DW_AT_artificial [DW_FORM_flag_present] + - AbbrCode: 0x3 + Values: + - Value: 0x1 + - Value: 0x1 + + - AbbrCode: 0x0 +... 
+)"; + YAMLModuleTester t(yamldata); + + DWARFUnit *unit = t.GetDwarfUnit(); + ASSERT_NE(unit, nullptr); + const DWARFDebugInfoEntry *cu_entry = unit->DIE().GetDIE(); + ASSERT_EQ(cu_entry->Tag(), DW_TAG_compile_unit); + ASSERT_EQ(unit->GetDWARFLanguageType(), DW_LANG_C_plus_plus); + DWARFDIE cu_die(unit, cu_entry); + + auto f1 = cu_die.GetFirstChild(); + ASSERT_TRUE(f1.IsValid()); + ASSERT_EQ(f1.Tag(), DW_TAG_subprogram); + + auto attrs = f1.GetAttributes(DWARFDebugInfoEntry::Recurse::yes); + EXPECT_EQ(attrs.Size(), 7U); + EXPECT_EQ(attrs.FindAttributeIndex(DW_AT_declaration), UINT32_MAX); +} + +INSTANTIATE_TEST_SUITE_P(GetAttributeTests, GetAttributesTestFixture, + testing::Values(DW_AT_specification, + DW_AT_abstract_origin)); From eff6b642583ace53aaed7947b92a43bcba283866 Mon Sep 17 00:00:00 2001 From: David Green Date: Fri, 17 Jan 2025 13:19:11 +0000 Subject: [PATCH 45/45] [AArch64][GlobalISel] Update and regenerate some vecreduce and other tests. NFC --- .../GlobalISel/legalize-reduce-add.mir | 112 +- llvm/test/CodeGen/AArch64/aarch64-addv.ll | 230 +-- llvm/test/CodeGen/AArch64/arm64-ldxr-stxr.ll | 178 +- .../AArch64/vec-combine-compare-to-bitmask.ll | 1751 +++++++++-------- 4 files changed, 1129 insertions(+), 1142 deletions(-) diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-reduce-add.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-reduce-add.mir index 253e6ebe793ce..76fdfd0c301f6 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-reduce-add.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-reduce-add.mir @@ -6,15 +6,15 @@ tracksRegLiveness: true body: | bb.1: liveins: $x0 - ; CHECK-LABEL: name: add_v16s8 ; CHECK: liveins: $x0 - ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY $x0 - ; CHECK: [[LOAD:%[0-9]+]]:_(<16 x s8>) = G_LOAD [[COPY]](p0) :: (load (<16 x s8>)) - ; CHECK: [[VECREDUCE_ADD:%[0-9]+]]:_(s8) = G_VECREDUCE_ADD [[LOAD]](<16 x s8>) - ; CHECK: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[VECREDUCE_ADD]](s8) - ; CHECK: $w0 = COPY [[ANYEXT]](s32) - ; CHECK: RET_ReallyLR implicit $w0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x0 + ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<16 x s8>) = G_LOAD [[COPY]](p0) :: (load (<16 x s8>)) + ; CHECK-NEXT: [[VECREDUCE_ADD:%[0-9]+]]:_(s8) = G_VECREDUCE_ADD [[LOAD]](<16 x s8>) + ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[VECREDUCE_ADD]](s8) + ; CHECK-NEXT: $w0 = COPY [[ANYEXT]](s32) + ; CHECK-NEXT: RET_ReallyLR implicit $w0 %0:_(p0) = COPY $x0 %1:_(<16 x s8>) = G_LOAD %0(p0) :: (load (<16 x s8>)) %2:_(s8) = G_VECREDUCE_ADD %1(<16 x s8>) @@ -29,15 +29,15 @@ tracksRegLiveness: true body: | bb.1: liveins: $x0 - ; CHECK-LABEL: name: add_v8s16 ; CHECK: liveins: $x0 - ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY $x0 - ; CHECK: [[LOAD:%[0-9]+]]:_(<8 x s16>) = G_LOAD [[COPY]](p0) :: (load (<8 x s16>)) - ; CHECK: [[VECREDUCE_ADD:%[0-9]+]]:_(s16) = G_VECREDUCE_ADD [[LOAD]](<8 x s16>) - ; CHECK: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[VECREDUCE_ADD]](s16) - ; CHECK: $w0 = COPY [[ANYEXT]](s32) - ; CHECK: RET_ReallyLR implicit $w0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x0 + ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<8 x s16>) = G_LOAD [[COPY]](p0) :: (load (<8 x s16>)) + ; CHECK-NEXT: [[VECREDUCE_ADD:%[0-9]+]]:_(s16) = G_VECREDUCE_ADD [[LOAD]](<8 x s16>) + ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[VECREDUCE_ADD]](s16) + ; CHECK-NEXT: $w0 = COPY [[ANYEXT]](s32) + ; CHECK-NEXT: RET_ReallyLR implicit $w0 %0:_(p0) = COPY $x0 %1:_(<8 x s16>) = G_LOAD %0(p0) :: (load (<8 x s16>)) %2:_(s16) = 
G_VECREDUCE_ADD %1(<8 x s16>) @@ -52,14 +52,14 @@ tracksRegLiveness: true body: | bb.1: liveins: $x0 - ; CHECK-LABEL: name: add_v4s32 ; CHECK: liveins: $x0 - ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY $x0 - ; CHECK: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p0) :: (load (<4 x s32>)) - ; CHECK: [[VECREDUCE_ADD:%[0-9]+]]:_(s32) = G_VECREDUCE_ADD [[LOAD]](<4 x s32>) - ; CHECK: $w0 = COPY [[VECREDUCE_ADD]](s32) - ; CHECK: RET_ReallyLR implicit $w0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x0 + ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p0) :: (load (<4 x s32>)) + ; CHECK-NEXT: [[VECREDUCE_ADD:%[0-9]+]]:_(s32) = G_VECREDUCE_ADD [[LOAD]](<4 x s32>) + ; CHECK-NEXT: $w0 = COPY [[VECREDUCE_ADD]](s32) + ; CHECK-NEXT: RET_ReallyLR implicit $w0 %0:_(p0) = COPY $x0 %1:_(<4 x s32>) = G_LOAD %0(p0) :: (load (<4 x s32>)) %2:_(s32) = G_VECREDUCE_ADD %1(<4 x s32>) @@ -73,14 +73,14 @@ tracksRegLiveness: true body: | bb.1: liveins: $x0 - ; CHECK-LABEL: name: add_v2s64 ; CHECK: liveins: $x0 - ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY $x0 - ; CHECK: [[LOAD:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[COPY]](p0) :: (load (<2 x s64>)) - ; CHECK: [[VECREDUCE_ADD:%[0-9]+]]:_(s64) = G_VECREDUCE_ADD [[LOAD]](<2 x s64>) - ; CHECK: $x0 = COPY [[VECREDUCE_ADD]](s64) - ; CHECK: RET_ReallyLR implicit $x0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x0 + ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[COPY]](p0) :: (load (<2 x s64>)) + ; CHECK-NEXT: [[VECREDUCE_ADD:%[0-9]+]]:_(s64) = G_VECREDUCE_ADD [[LOAD]](<2 x s64>) + ; CHECK-NEXT: $x0 = COPY [[VECREDUCE_ADD]](s64) + ; CHECK-NEXT: RET_ReallyLR implicit $x0 %0:_(p0) = COPY $x0 %1:_(<2 x s64>) = G_LOAD %0(p0) :: (load (<2 x s64>)) %2:_(s64) = G_VECREDUCE_ADD %1(<2 x s64>) @@ -94,14 +94,14 @@ tracksRegLiveness: true body: | bb.1: liveins: $x0 - ; CHECK-LABEL: name: add_v2s32 ; CHECK: liveins: $x0 - ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY $x0 - ; CHECK: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p0) :: (load (<2 x s32>)) - ; CHECK: [[VECREDUCE_ADD:%[0-9]+]]:_(s32) = G_VECREDUCE_ADD [[LOAD]](<2 x s32>) - ; CHECK: $w0 = COPY [[VECREDUCE_ADD]](s32) - ; CHECK: RET_ReallyLR implicit $w0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x0 + ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p0) :: (load (<2 x s32>)) + ; CHECK-NEXT: [[VECREDUCE_ADD:%[0-9]+]]:_(s32) = G_VECREDUCE_ADD [[LOAD]](<2 x s32>) + ; CHECK-NEXT: $w0 = COPY [[VECREDUCE_ADD]](s32) + ; CHECK-NEXT: RET_ReallyLR implicit $w0 %0:_(p0) = COPY $x0 %1:_(<2 x s32>) = G_LOAD %0(p0) :: (load (<2 x s32>)) %2:_(s32) = G_VECREDUCE_ADD %1(<2 x s32>) @@ -111,24 +111,25 @@ body: | ... --- name: test_v8i64 +# This is a power-of-2 legalization, so use a tree reduction. alignment: 4 tracksRegLiveness: true body: | bb.1: liveins: $q0, $q1, $q2, $q3 - ; This is a power-of-2 legalization, so use a tree reduction. 
; CHECK-LABEL: name: test_v8i64 ; CHECK: liveins: $q0, $q1, $q2, $q3 - ; CHECK: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $q0 - ; CHECK: [[COPY1:%[0-9]+]]:_(<2 x s64>) = COPY $q1 - ; CHECK: [[COPY2:%[0-9]+]]:_(<2 x s64>) = COPY $q2 - ; CHECK: [[COPY3:%[0-9]+]]:_(<2 x s64>) = COPY $q3 - ; CHECK: [[ADD:%[0-9]+]]:_(<2 x s64>) = G_ADD [[COPY]], [[COPY1]] - ; CHECK: [[ADD1:%[0-9]+]]:_(<2 x s64>) = G_ADD [[COPY2]], [[COPY3]] - ; CHECK: [[ADD2:%[0-9]+]]:_(<2 x s64>) = G_ADD [[ADD]], [[ADD1]] - ; CHECK: [[VECREDUCE_ADD:%[0-9]+]]:_(s64) = G_VECREDUCE_ADD [[ADD2]](<2 x s64>) - ; CHECK: $x0 = COPY [[VECREDUCE_ADD]](s64) - ; CHECK: RET_ReallyLR implicit $x0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $q0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s64>) = COPY $q1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(<2 x s64>) = COPY $q2 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(<2 x s64>) = COPY $q3 + ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(<2 x s64>) = G_ADD [[COPY]], [[COPY1]] + ; CHECK-NEXT: [[ADD1:%[0-9]+]]:_(<2 x s64>) = G_ADD [[COPY2]], [[COPY3]] + ; CHECK-NEXT: [[ADD2:%[0-9]+]]:_(<2 x s64>) = G_ADD [[ADD]], [[ADD1]] + ; CHECK-NEXT: [[VECREDUCE_ADD:%[0-9]+]]:_(s64) = G_VECREDUCE_ADD [[ADD2]](<2 x s64>) + ; CHECK-NEXT: $x0 = COPY [[VECREDUCE_ADD]](s64) + ; CHECK-NEXT: RET_ReallyLR implicit $x0 %0:_(<2 x s64>) = COPY $q0 %1:_(<2 x s64>) = COPY $q1 %2:_(<2 x s64>) = COPY $q2 @@ -143,25 +144,26 @@ body: | ... --- name: test_v6i64 +# This is a non-power-of-2 legalization, generate multiple vector reductions +# and combine them with scalar ops. alignment: 4 tracksRegLiveness: true body: | bb.1: liveins: $q0, $q1, $q2, $q3 - ; This is a non-power-of-2 legalization, generate multiple vector reductions - ; and combine them with scalar ops. ; CHECK-LABEL: name: test_v6i64 ; CHECK: liveins: $q0, $q1, $q2, $q3 - ; CHECK: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $q0 - ; CHECK: [[COPY1:%[0-9]+]]:_(<2 x s64>) = COPY $q1 - ; CHECK: [[COPY2:%[0-9]+]]:_(<2 x s64>) = COPY $q2 - ; CHECK: [[VECREDUCE_ADD:%[0-9]+]]:_(s64) = G_VECREDUCE_ADD [[COPY]](<2 x s64>) - ; CHECK: [[VECREDUCE_ADD1:%[0-9]+]]:_(s64) = G_VECREDUCE_ADD [[COPY1]](<2 x s64>) - ; CHECK: [[VECREDUCE_ADD2:%[0-9]+]]:_(s64) = G_VECREDUCE_ADD [[COPY2]](<2 x s64>) - ; CHECK: [[ADD:%[0-9]+]]:_(s64) = G_ADD [[VECREDUCE_ADD]], [[VECREDUCE_ADD1]] - ; CHECK: [[ADD1:%[0-9]+]]:_(s64) = G_ADD [[ADD]], [[VECREDUCE_ADD2]] - ; CHECK: $x0 = COPY [[ADD1]](s64) - ; CHECK: RET_ReallyLR implicit $x0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $q0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s64>) = COPY $q1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(<2 x s64>) = COPY $q2 + ; CHECK-NEXT: [[VECREDUCE_ADD:%[0-9]+]]:_(s64) = G_VECREDUCE_ADD [[COPY]](<2 x s64>) + ; CHECK-NEXT: [[VECREDUCE_ADD1:%[0-9]+]]:_(s64) = G_VECREDUCE_ADD [[COPY1]](<2 x s64>) + ; CHECK-NEXT: [[VECREDUCE_ADD2:%[0-9]+]]:_(s64) = G_VECREDUCE_ADD [[COPY2]](<2 x s64>) + ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s64) = G_ADD [[VECREDUCE_ADD]], [[VECREDUCE_ADD1]] + ; CHECK-NEXT: [[ADD1:%[0-9]+]]:_(s64) = G_ADD [[ADD]], [[VECREDUCE_ADD2]] + ; CHECK-NEXT: $x0 = COPY [[ADD1]](s64) + ; CHECK-NEXT: RET_ReallyLR implicit $x0 %0:_(<2 x s64>) = COPY $q0 %1:_(<2 x s64>) = COPY $q1 %2:_(<2 x s64>) = COPY $q2 diff --git a/llvm/test/CodeGen/AArch64/aarch64-addv.ll b/llvm/test/CodeGen/AArch64/aarch64-addv.ll index def4192b0e005..aba284b4e0d29 100644 --- a/llvm/test/CodeGen/AArch64/aarch64-addv.ll +++ b/llvm/test/CodeGen/AArch64/aarch64-addv.ll @@ -1,8 +1,7 @@ ; NOTE: Assertions have been autogenerated by 
utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=aarch64 -aarch64-neon-syntax=generic | FileCheck %s -check-prefixes=CHECK,SDAG -; RUN: llc < %s -global-isel=1 -global-isel-abort=2 -mtriple=aarch64 -aarch64-neon-syntax=generic 2>&1 | FileCheck %s --check-prefixes=CHECK,GISEL +; RUN: llc < %s -mtriple=aarch64 -aarch64-neon-syntax=generic | FileCheck %s -check-prefixes=CHECK,CHECK-SD +; RUN: llc < %s -mtriple=aarch64 -global-isel=1 -global-isel-abort=2 -aarch64-neon-syntax=generic 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI -; Function Attrs: nounwind readnone declare i8 @llvm.vector.reduce.add.v2i8(<2 x i8>) declare i8 @llvm.vector.reduce.add.v3i8(<3 x i8>) declare i8 @llvm.vector.reduce.add.v4i8(<4 x i8>) @@ -23,14 +22,14 @@ declare i64 @llvm.vector.reduce.add.v3i64(<3 x i64>) declare i64 @llvm.vector.reduce.add.v4i64(<4 x i64>) declare i128 @llvm.vector.reduce.add.v2i128(<2 x i128>) -; GISEL: warning: Instruction selection used fallback path for addv_v2i8 -; GISEL-NEXT: warning: Instruction selection used fallback path for addv_v3i8 -; GISEL-NEXT: warning: Instruction selection used fallback path for addv_v4i8 -; GISEL-NEXT: warning: Instruction selection used fallback path for addv_v2i16 -; GISEL-NEXT: warning: Instruction selection used fallback path for addv_v3i16 -; GISEL-NEXT: warning: Instruction selection used fallback path for addv_v3i32 -; GISEL-NEXT: warning: Instruction selection used fallback path for addv_v3i64 -; GISEL-NEXT: warning: Instruction selection used fallback path for addv_v2i128 +; CHECK-GI: warning: Instruction selection used fallback path for addv_v2i8 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for addv_v3i8 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for addv_v4i8 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for addv_v2i16 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for addv_v3i16 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for addv_v3i32 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for addv_v3i64 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for addv_v2i128 define i8 @add_B(ptr %arr) { @@ -83,34 +82,34 @@ define i64 @add_D(ptr %arr) { define i32 @oversized_ADDV_256(ptr noalias nocapture readonly %arg1, ptr noalias nocapture readonly %arg2) { -; SDAG-LABEL: oversized_ADDV_256: -; SDAG: // %bb.0: // %entry -; SDAG-NEXT: ldr d0, [x0] -; SDAG-NEXT: ldr d1, [x1] -; SDAG-NEXT: uabdl v0.8h, v0.8b, v1.8b -; SDAG-NEXT: uaddlv s0, v0.8h -; SDAG-NEXT: fmov w0, s0 -; SDAG-NEXT: ret +; CHECK-SD-LABEL: oversized_ADDV_256: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: ldr d0, [x0] +; CHECK-SD-NEXT: ldr d1, [x1] +; CHECK-SD-NEXT: uabdl v0.8h, v0.8b, v1.8b +; CHECK-SD-NEXT: uaddlv s0, v0.8h +; CHECK-SD-NEXT: fmov w0, s0 +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: oversized_ADDV_256: -; GISEL: // %bb.0: // %entry -; GISEL-NEXT: ldr d1, [x0] -; GISEL-NEXT: ldr d2, [x1] -; GISEL-NEXT: movi v0.2d, #0000000000000000 -; GISEL-NEXT: usubl v1.8h, v1.8b, v2.8b -; GISEL-NEXT: sshll v2.4s, v1.4h, #0 -; GISEL-NEXT: sshll2 v3.4s, v1.8h, #0 -; GISEL-NEXT: ssubw2 v0.4s, v0.4s, v1.8h -; GISEL-NEXT: cmlt v4.4s, v2.4s, #0 -; GISEL-NEXT: cmlt v5.4s, v3.4s, #0 -; GISEL-NEXT: neg v6.4s, v2.4s -; GISEL-NEXT: mov v1.16b, v4.16b -; GISEL-NEXT: bif v0.16b, v3.16b, v5.16b -; GISEL-NEXT: bsl v1.16b, v6.16b, v2.16b -; GISEL-NEXT: add v0.4s, v1.4s, v0.4s -; GISEL-NEXT: addv s0, v0.4s -; GISEL-NEXT: fmov w0, s0 -; GISEL-NEXT: 
ret +; CHECK-GI-LABEL: oversized_ADDV_256: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: ldr d1, [x0] +; CHECK-GI-NEXT: ldr d2, [x1] +; CHECK-GI-NEXT: movi v0.2d, #0000000000000000 +; CHECK-GI-NEXT: usubl v1.8h, v1.8b, v2.8b +; CHECK-GI-NEXT: sshll v2.4s, v1.4h, #0 +; CHECK-GI-NEXT: sshll2 v3.4s, v1.8h, #0 +; CHECK-GI-NEXT: ssubw2 v0.4s, v0.4s, v1.8h +; CHECK-GI-NEXT: cmlt v4.4s, v2.4s, #0 +; CHECK-GI-NEXT: cmlt v5.4s, v3.4s, #0 +; CHECK-GI-NEXT: neg v6.4s, v2.4s +; CHECK-GI-NEXT: mov v1.16b, v4.16b +; CHECK-GI-NEXT: bif v0.16b, v3.16b, v5.16b +; CHECK-GI-NEXT: bsl v1.16b, v6.16b, v2.16b +; CHECK-GI-NEXT: add v0.4s, v1.4s, v0.4s +; CHECK-GI-NEXT: addv s0, v0.4s +; CHECK-GI-NEXT: fmov w0, s0 +; CHECK-GI-NEXT: ret entry: %0 = load <8 x i8>, ptr %arg1, align 1 %1 = zext <8 x i8> %0 to <8 x i32> @@ -127,48 +126,48 @@ entry: declare i32 @llvm.vector.reduce.add.v16i32(<16 x i32>) define i32 @oversized_ADDV_512(ptr %arr) { -; SDAG-LABEL: oversized_ADDV_512: -; SDAG: // %bb.0: -; SDAG-NEXT: ldp q0, q1, [x0, #32] -; SDAG-NEXT: ldp q2, q3, [x0] -; SDAG-NEXT: add v1.4s, v3.4s, v1.4s -; SDAG-NEXT: add v0.4s, v2.4s, v0.4s -; SDAG-NEXT: add v0.4s, v0.4s, v1.4s -; SDAG-NEXT: addv s0, v0.4s -; SDAG-NEXT: fmov w0, s0 -; SDAG-NEXT: ret +; CHECK-SD-LABEL: oversized_ADDV_512: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: ldp q0, q1, [x0, #32] +; CHECK-SD-NEXT: ldp q2, q3, [x0] +; CHECK-SD-NEXT: add v1.4s, v3.4s, v1.4s +; CHECK-SD-NEXT: add v0.4s, v2.4s, v0.4s +; CHECK-SD-NEXT: add v0.4s, v0.4s, v1.4s +; CHECK-SD-NEXT: addv s0, v0.4s +; CHECK-SD-NEXT: fmov w0, s0 +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: oversized_ADDV_512: -; GISEL: // %bb.0: -; GISEL-NEXT: ldp q0, q1, [x0] -; GISEL-NEXT: ldp q2, q3, [x0, #32] -; GISEL-NEXT: add v0.4s, v0.4s, v1.4s -; GISEL-NEXT: add v1.4s, v2.4s, v3.4s -; GISEL-NEXT: add v0.4s, v0.4s, v1.4s -; GISEL-NEXT: addv s0, v0.4s -; GISEL-NEXT: fmov w0, s0 -; GISEL-NEXT: ret +; CHECK-GI-LABEL: oversized_ADDV_512: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: ldp q0, q1, [x0] +; CHECK-GI-NEXT: ldp q2, q3, [x0, #32] +; CHECK-GI-NEXT: add v0.4s, v0.4s, v1.4s +; CHECK-GI-NEXT: add v1.4s, v2.4s, v3.4s +; CHECK-GI-NEXT: add v0.4s, v0.4s, v1.4s +; CHECK-GI-NEXT: addv s0, v0.4s +; CHECK-GI-NEXT: fmov w0, s0 +; CHECK-GI-NEXT: ret %bin.rdx = load <16 x i32>, ptr %arr %r = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %bin.rdx) ret i32 %r } define i8 @addv_combine_i8(<8 x i8> %a1, <8 x i8> %a2) { -; SDAG-LABEL: addv_combine_i8: -; SDAG: // %bb.0: // %entry -; SDAG-NEXT: add v0.8b, v0.8b, v1.8b -; SDAG-NEXT: addv b0, v0.8b -; SDAG-NEXT: fmov w0, s0 -; SDAG-NEXT: ret +; CHECK-SD-LABEL: addv_combine_i8: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: add v0.8b, v0.8b, v1.8b +; CHECK-SD-NEXT: addv b0, v0.8b +; CHECK-SD-NEXT: fmov w0, s0 +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: addv_combine_i8: -; GISEL: // %bb.0: // %entry -; GISEL-NEXT: addv b0, v0.8b -; GISEL-NEXT: addv b1, v1.8b -; GISEL-NEXT: fmov w8, s0 -; GISEL-NEXT: fmov w9, s1 -; GISEL-NEXT: add w0, w9, w8, uxtb -; GISEL-NEXT: ret +; CHECK-GI-LABEL: addv_combine_i8: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: addv b0, v0.8b +; CHECK-GI-NEXT: addv b1, v1.8b +; CHECK-GI-NEXT: fmov w8, s0 +; CHECK-GI-NEXT: fmov w9, s1 +; CHECK-GI-NEXT: add w0, w9, w8, uxtb +; CHECK-GI-NEXT: ret entry: %rdx.1 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> %a1) %rdx.2 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> %a2) @@ -177,21 +176,21 @@ entry: } define i16 @addv_combine_i16(<4 x i16> %a1, <4 x i16> %a2) { -; SDAG-LABEL: addv_combine_i16: -; SDAG: // 
%bb.0: // %entry -; SDAG-NEXT: add v0.4h, v0.4h, v1.4h -; SDAG-NEXT: addv h0, v0.4h -; SDAG-NEXT: fmov w0, s0 -; SDAG-NEXT: ret +; CHECK-SD-LABEL: addv_combine_i16: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: add v0.4h, v0.4h, v1.4h +; CHECK-SD-NEXT: addv h0, v0.4h +; CHECK-SD-NEXT: fmov w0, s0 +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: addv_combine_i16: -; GISEL: // %bb.0: // %entry -; GISEL-NEXT: addv h0, v0.4h -; GISEL-NEXT: addv h1, v1.4h -; GISEL-NEXT: fmov w8, s0 -; GISEL-NEXT: fmov w9, s1 -; GISEL-NEXT: add w0, w9, w8, uxth -; GISEL-NEXT: ret +; CHECK-GI-LABEL: addv_combine_i16: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: addv h0, v0.4h +; CHECK-GI-NEXT: addv h1, v1.4h +; CHECK-GI-NEXT: fmov w8, s0 +; CHECK-GI-NEXT: fmov w9, s1 +; CHECK-GI-NEXT: add w0, w9, w8, uxth +; CHECK-GI-NEXT: ret entry: %rdx.1 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %a1) %rdx.2 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %a2) @@ -200,21 +199,21 @@ entry: } define i32 @addv_combine_i32(<4 x i32> %a1, <4 x i32> %a2) { -; SDAG-LABEL: addv_combine_i32: -; SDAG: // %bb.0: // %entry -; SDAG-NEXT: add v0.4s, v0.4s, v1.4s -; SDAG-NEXT: addv s0, v0.4s -; SDAG-NEXT: fmov w0, s0 -; SDAG-NEXT: ret +; CHECK-SD-LABEL: addv_combine_i32: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: add v0.4s, v0.4s, v1.4s +; CHECK-SD-NEXT: addv s0, v0.4s +; CHECK-SD-NEXT: fmov w0, s0 +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: addv_combine_i32: -; GISEL: // %bb.0: // %entry -; GISEL-NEXT: addv s0, v0.4s -; GISEL-NEXT: addv s1, v1.4s -; GISEL-NEXT: fmov w8, s0 -; GISEL-NEXT: fmov w9, s1 -; GISEL-NEXT: add w0, w8, w9 -; GISEL-NEXT: ret +; CHECK-GI-LABEL: addv_combine_i32: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: addv s0, v0.4s +; CHECK-GI-NEXT: addv s1, v1.4s +; CHECK-GI-NEXT: fmov w8, s0 +; CHECK-GI-NEXT: fmov w9, s1 +; CHECK-GI-NEXT: add w0, w8, w9 +; CHECK-GI-NEXT: ret entry: %rdx.1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a1) %rdx.2 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a2) @@ -223,21 +222,21 @@ entry: } define i64 @addv_combine_i64(<2 x i64> %a1, <2 x i64> %a2) { -; SDAG-LABEL: addv_combine_i64: -; SDAG: // %bb.0: // %entry -; SDAG-NEXT: add v0.2d, v0.2d, v1.2d -; SDAG-NEXT: addp d0, v0.2d -; SDAG-NEXT: fmov x0, d0 -; SDAG-NEXT: ret +; CHECK-SD-LABEL: addv_combine_i64: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: add v0.2d, v0.2d, v1.2d +; CHECK-SD-NEXT: addp d0, v0.2d +; CHECK-SD-NEXT: fmov x0, d0 +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: addv_combine_i64: -; GISEL: // %bb.0: // %entry -; GISEL-NEXT: addp d0, v0.2d -; GISEL-NEXT: addp d1, v1.2d -; GISEL-NEXT: fmov x8, d0 -; GISEL-NEXT: fmov x9, d1 -; GISEL-NEXT: add x0, x8, x9 -; GISEL-NEXT: ret +; CHECK-GI-LABEL: addv_combine_i64: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: addp d0, v0.2d +; CHECK-GI-NEXT: addp d1, v1.2d +; CHECK-GI-NEXT: fmov x8, d0 +; CHECK-GI-NEXT: fmov x9, d1 +; CHECK-GI-NEXT: add x0, x8, x9 +; CHECK-GI-NEXT: ret entry: %rdx.1 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %a1) %rdx.2 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %a2) @@ -471,3 +470,6 @@ entry: ret i128 %arg1 } +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; GISEL: {{.*}} +; SDAG: {{.*}} diff --git a/llvm/test/CodeGen/AArch64/arm64-ldxr-stxr.ll b/llvm/test/CodeGen/AArch64/arm64-ldxr-stxr.ll index b498611242d46..d69d1b6eb4a2a 100644 --- a/llvm/test/CodeGen/AArch64/arm64-ldxr-stxr.ll +++ b/llvm/test/CodeGen/AArch64/arm64-ldxr-stxr.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc < %s -mtriple=arm64-linux-gnu | FileCheck %s --check-prefixes=CHECK,SDAG -; RUN: llc < %s -global-isel -global-isel-abort=1 -pass-remarks-missed=gisel* -mtriple=arm64-linux-gnu 2>&1 | FileCheck %s --check-prefixes=CHECK,GISEL,FALLBACK +; RUN: llc < %s -mtriple=arm64-linux-gnu | FileCheck %s --check-prefixes=CHECK,CHECK-SD +; RUN: llc < %s -mtriple=arm64-linux-gnu -global-isel | FileCheck %s --check-prefixes=CHECK,CHECK-GI %0 = type { i64, i64 } @@ -39,22 +39,21 @@ declare i32 @llvm.aarch64.stxp(i64, i64, ptr) nounwind @var = dso_local global i64 0, align 8 -; FALLBACK-NOT: remark:{{.*}}test_load_i8 define dso_local void @test_load_i8(ptr %addr) { -; SDAG-LABEL: test_load_i8: -; SDAG: // %bb.0: -; SDAG-NEXT: ldxrb w8, [x0] -; SDAG-NEXT: adrp x9, var -; SDAG-NEXT: str x8, [x9, :lo12:var] -; SDAG-NEXT: ret +; CHECK-SD-LABEL: test_load_i8: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: ldxrb w8, [x0] +; CHECK-SD-NEXT: adrp x9, var +; CHECK-SD-NEXT: str x8, [x9, :lo12:var] +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: test_load_i8: -; GISEL: // %bb.0: -; GISEL-NEXT: ldxrb w9, [x0] -; GISEL-NEXT: adrp x8, var -; GISEL-NEXT: and x9, x9, #0xff -; GISEL-NEXT: str x9, [x8, :lo12:var] -; GISEL-NEXT: ret +; CHECK-GI-LABEL: test_load_i8: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: ldxrb w9, [x0] +; CHECK-GI-NEXT: adrp x8, var +; CHECK-GI-NEXT: and x9, x9, #0xff +; CHECK-GI-NEXT: str x9, [x8, :lo12:var] +; CHECK-GI-NEXT: ret %val = call i64 @llvm.aarch64.ldxr.p0(ptr elementtype(i8) %addr) %shortval = trunc i64 %val to i8 @@ -63,22 +62,21 @@ define dso_local void @test_load_i8(ptr %addr) { ret void } -; FALLBACK-NOT: remark:{{.*}}test_load_i16 define dso_local void @test_load_i16(ptr %addr) { -; SDAG-LABEL: test_load_i16: -; SDAG: // %bb.0: -; SDAG-NEXT: ldxrh w8, [x0] -; SDAG-NEXT: adrp x9, var -; SDAG-NEXT: str x8, [x9, :lo12:var] -; SDAG-NEXT: ret +; CHECK-SD-LABEL: test_load_i16: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: ldxrh w8, [x0] +; CHECK-SD-NEXT: adrp x9, var +; CHECK-SD-NEXT: str x8, [x9, :lo12:var] +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: test_load_i16: -; GISEL: // %bb.0: -; GISEL-NEXT: ldxrh w9, [x0] -; GISEL-NEXT: adrp x8, var -; GISEL-NEXT: and x9, x9, #0xffff -; GISEL-NEXT: str x9, [x8, :lo12:var] -; GISEL-NEXT: ret +; CHECK-GI-LABEL: test_load_i16: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: ldxrh w9, [x0] +; CHECK-GI-NEXT: adrp x8, var +; CHECK-GI-NEXT: and x9, x9, #0xffff +; CHECK-GI-NEXT: str x9, [x8, :lo12:var] +; CHECK-GI-NEXT: ret %val = call i64 @llvm.aarch64.ldxr.p0(ptr elementtype(i16) %addr) %shortval = trunc i64 %val to i16 @@ -87,22 +85,21 @@ define dso_local void @test_load_i16(ptr %addr) { ret void } -; FALLBACK-NOT: remark:{{.*}}test_load_i32 define dso_local void @test_load_i32(ptr %addr) { -; SDAG-LABEL: test_load_i32: -; SDAG: // %bb.0: -; SDAG-NEXT: ldxr w8, [x0] -; SDAG-NEXT: adrp x9, var -; SDAG-NEXT: str x8, [x9, :lo12:var] -; SDAG-NEXT: ret +; CHECK-SD-LABEL: test_load_i32: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: ldxr w8, [x0] +; CHECK-SD-NEXT: adrp x9, var +; CHECK-SD-NEXT: str x8, [x9, :lo12:var] +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: 
test_load_i32: -; GISEL: // %bb.0: -; GISEL-NEXT: ldxr w9, [x0] -; GISEL-NEXT: adrp x8, var -; GISEL-NEXT: mov w9, w9 -; GISEL-NEXT: str x9, [x8, :lo12:var] -; GISEL-NEXT: ret +; CHECK-GI-LABEL: test_load_i32: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: ldxr w9, [x0] +; CHECK-GI-NEXT: adrp x8, var +; CHECK-GI-NEXT: mov w9, w9 +; CHECK-GI-NEXT: str x9, [x8, :lo12:var] +; CHECK-GI-NEXT: ret %val = call i64 @llvm.aarch64.ldxr.p0(ptr elementtype(i32) %addr) %shortval = trunc i64 %val to i32 @@ -111,7 +108,6 @@ define dso_local void @test_load_i32(ptr %addr) { ret void } -; FALLBACK-NOT: remark:{{.*}}test_load_i64 define dso_local void @test_load_i64(ptr %addr) { ; CHECK-LABEL: test_load_i64: ; CHECK: // %bb.0: @@ -128,7 +124,6 @@ define dso_local void @test_load_i64(ptr %addr) { declare i64 @llvm.aarch64.ldxr.p0(ptr) nounwind -; FALLBACK-NOT: remark:{{.*}}test_store_i8 define dso_local i32 @test_store_i8(i32, i8 %val, ptr %addr) { ; CHECK-LABEL: test_store_i8: ; CHECK: // %bb.0: @@ -140,7 +135,6 @@ define dso_local i32 @test_store_i8(i32, i8 %val, ptr %addr) { ret i32 %res } -; FALLBACK-NOT: remark:{{.*}}test_store_i16 define dso_local i32 @test_store_i16(i32, i16 %val, ptr %addr) { ; CHECK-LABEL: test_store_i16: ; CHECK: // %bb.0: @@ -152,7 +146,6 @@ define dso_local i32 @test_store_i16(i32, i16 %val, ptr %addr) { ret i32 %res } -; FALLBACK-NOT: remark:{{.*}}test_store_i32 define dso_local i32 @test_store_i32(i32, i32 %val, ptr %addr) { ; CHECK-LABEL: test_store_i32: ; CHECK: // %bb.0: @@ -163,7 +156,6 @@ define dso_local i32 @test_store_i32(i32, i32 %val, ptr %addr) { ret i32 %res } -; FALLBACK-NOT: remark:{{.*}}test_store_i64 define dso_local i32 @test_store_i64(i32, i64 %val, ptr %addr) { ; CHECK-LABEL: test_store_i64: ; CHECK: // %bb.0: @@ -219,22 +211,21 @@ entry: declare %0 @llvm.aarch64.ldaxp(ptr) nounwind declare i32 @llvm.aarch64.stlxp(i64, i64, ptr) nounwind -; FALLBACK-NOT: remark:{{.*}}test_load_acquire_i8 define dso_local void @test_load_acquire_i8(ptr %addr) { -; SDAG-LABEL: test_load_acquire_i8: -; SDAG: // %bb.0: -; SDAG-NEXT: ldaxrb w8, [x0] -; SDAG-NEXT: adrp x9, var -; SDAG-NEXT: str x8, [x9, :lo12:var] -; SDAG-NEXT: ret +; CHECK-SD-LABEL: test_load_acquire_i8: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: ldaxrb w8, [x0] +; CHECK-SD-NEXT: adrp x9, var +; CHECK-SD-NEXT: str x8, [x9, :lo12:var] +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: test_load_acquire_i8: -; GISEL: // %bb.0: -; GISEL-NEXT: ldaxrb w9, [x0] -; GISEL-NEXT: adrp x8, var -; GISEL-NEXT: and x9, x9, #0xff -; GISEL-NEXT: str x9, [x8, :lo12:var] -; GISEL-NEXT: ret +; CHECK-GI-LABEL: test_load_acquire_i8: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: ldaxrb w9, [x0] +; CHECK-GI-NEXT: adrp x8, var +; CHECK-GI-NEXT: and x9, x9, #0xff +; CHECK-GI-NEXT: str x9, [x8, :lo12:var] +; CHECK-GI-NEXT: ret %val = call i64 @llvm.aarch64.ldaxr.p0(ptr elementtype(i8) %addr) %shortval = trunc i64 %val to i8 @@ -243,22 +234,21 @@ define dso_local void @test_load_acquire_i8(ptr %addr) { ret void } -; FALLBACK-NOT: remark:{{.*}}test_load_acquire_i16 define dso_local void @test_load_acquire_i16(ptr %addr) { -; SDAG-LABEL: test_load_acquire_i16: -; SDAG: // %bb.0: -; SDAG-NEXT: ldaxrh w8, [x0] -; SDAG-NEXT: adrp x9, var -; SDAG-NEXT: str x8, [x9, :lo12:var] -; SDAG-NEXT: ret +; CHECK-SD-LABEL: test_load_acquire_i16: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: ldaxrh w8, [x0] +; CHECK-SD-NEXT: adrp x9, var +; CHECK-SD-NEXT: str x8, [x9, :lo12:var] +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: test_load_acquire_i16: -; GISEL: // %bb.0: -; GISEL-NEXT: ldaxrh w9, 
[x0] -; GISEL-NEXT: adrp x8, var -; GISEL-NEXT: and x9, x9, #0xffff -; GISEL-NEXT: str x9, [x8, :lo12:var] -; GISEL-NEXT: ret +; CHECK-GI-LABEL: test_load_acquire_i16: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: ldaxrh w9, [x0] +; CHECK-GI-NEXT: adrp x8, var +; CHECK-GI-NEXT: and x9, x9, #0xffff +; CHECK-GI-NEXT: str x9, [x8, :lo12:var] +; CHECK-GI-NEXT: ret %val = call i64 @llvm.aarch64.ldaxr.p0(ptr elementtype(i16) %addr) %shortval = trunc i64 %val to i16 @@ -267,22 +257,21 @@ define dso_local void @test_load_acquire_i16(ptr %addr) { ret void } -; FALLBACK-NOT: remark:{{.*}}test_load_acquire_i32 define dso_local void @test_load_acquire_i32(ptr %addr) { -; SDAG-LABEL: test_load_acquire_i32: -; SDAG: // %bb.0: -; SDAG-NEXT: ldaxr w8, [x0] -; SDAG-NEXT: adrp x9, var -; SDAG-NEXT: str x8, [x9, :lo12:var] -; SDAG-NEXT: ret +; CHECK-SD-LABEL: test_load_acquire_i32: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: ldaxr w8, [x0] +; CHECK-SD-NEXT: adrp x9, var +; CHECK-SD-NEXT: str x8, [x9, :lo12:var] +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: test_load_acquire_i32: -; GISEL: // %bb.0: -; GISEL-NEXT: ldaxr w9, [x0] -; GISEL-NEXT: adrp x8, var -; GISEL-NEXT: mov w9, w9 -; GISEL-NEXT: str x9, [x8, :lo12:var] -; GISEL-NEXT: ret +; CHECK-GI-LABEL: test_load_acquire_i32: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: ldaxr w9, [x0] +; CHECK-GI-NEXT: adrp x8, var +; CHECK-GI-NEXT: mov w9, w9 +; CHECK-GI-NEXT: str x9, [x8, :lo12:var] +; CHECK-GI-NEXT: ret %val = call i64 @llvm.aarch64.ldaxr.p0(ptr elementtype(i32) %addr) %shortval = trunc i64 %val to i32 @@ -291,7 +280,6 @@ define dso_local void @test_load_acquire_i32(ptr %addr) { ret void } -; FALLBACK-NOT: remark:{{.*}}test_load_acquire_i64 define dso_local void @test_load_acquire_i64(ptr %addr) { ; CHECK-LABEL: test_load_acquire_i64: ; CHECK: // %bb.0: @@ -308,7 +296,6 @@ define dso_local void @test_load_acquire_i64(ptr %addr) { declare i64 @llvm.aarch64.ldaxr.p0(ptr) nounwind -; FALLBACK-NOT: remark:{{.*}}test_store_release_i8 define dso_local i32 @test_store_release_i8(i32, i8 %val, ptr %addr) { ; CHECK-LABEL: test_store_release_i8: ; CHECK: // %bb.0: @@ -320,7 +307,6 @@ define dso_local i32 @test_store_release_i8(i32, i8 %val, ptr %addr) { ret i32 %res } -; FALLBACK-NOT: remark:{{.*}}test_store_release_i16 define dso_local i32 @test_store_release_i16(i32, i16 %val, ptr %addr) { ; CHECK-LABEL: test_store_release_i16: ; CHECK: // %bb.0: @@ -332,7 +318,6 @@ define dso_local i32 @test_store_release_i16(i32, i16 %val, ptr %addr) { ret i32 %res } -; FALLBACK-NOT: remark:{{.*}}test_store_release_i32 define dso_local i32 @test_store_release_i32(i32, i32 %val, ptr %addr) { ; CHECK-LABEL: test_store_release_i32: ; CHECK: // %bb.0: @@ -343,7 +328,6 @@ define dso_local i32 @test_store_release_i32(i32, i32 %val, ptr %addr) { ret i32 %res } -; FALLBACK-NOT: remark:{{.*}}test_store_release_i64 define dso_local i32 @test_store_release_i64(i32, i64 %val, ptr %addr) { ; CHECK-LABEL: test_store_release_i64: ; CHECK: // %bb.0: @@ -378,5 +362,3 @@ define dso_local i32 @test_stxp_undef_inline_asm(ptr %p, i64 %x) nounwind { } declare i32 @llvm.aarch64.stlxr.p0(i64, ptr) nounwind -;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: -; FALLBACK: {{.*}} diff --git a/llvm/test/CodeGen/AArch64/vec-combine-compare-to-bitmask.ll b/llvm/test/CodeGen/AArch64/vec-combine-compare-to-bitmask.ll index 7f3c1fdc93380..c9fe258f11556 100644 --- a/llvm/test/CodeGen/AArch64/vec-combine-compare-to-bitmask.ll +++ b/llvm/test/CodeGen/AArch64/vec-combine-compare-to-bitmask.ll @@ -1,86 +1,87 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 -; RUN: llc -mtriple=aarch64-apple-darwin -mattr=+neon -aarch64-enable-collect-loh=false -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,SDAG -; RUN: llc -mtriple=aarch64-apple-darwin -mattr=+neon -aarch64-enable-collect-loh=false -global-isel -global-isel-abort=2 -verify-machineinstrs < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,GISEL +; RUN: llc -mtriple=aarch64-apple-darwin -mattr=+neon -aarch64-enable-collect-loh=false -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,CHECK-SD +; RUN: llc -mtriple=aarch64-apple-darwin -mattr=+neon -aarch64-enable-collect-loh=false -global-isel -global-isel-abort=2 -verify-machineinstrs < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI ; Basic tests from input vector to bitmask ; IR generated from clang for: ; __builtin_convertvector + reinterpret_cast -; GISEL: warning: Instruction selection used fallback path for clang_builtins_undef_concat_convert_to_bitmask4 -; GISEL-NEXT: warning: Instruction selection used fallback path for convert_to_bitmask_2xi32 -; GISEL-NEXT: warning: Instruction selection used fallback path for convert_to_bitmask_8xi2 -; GISEL-NEXT: warning: Instruction selection used fallback path for no_direct_convert_for_bad_concat +; CHECK-GI: warning: Instruction selection used fallback path for convert_to_bitmask2 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for clang_builtins_undef_concat_convert_to_bitmask4 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for convert_to_bitmask_2xi32 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for convert_to_bitmask_8xi2 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for no_direct_convert_for_bad_concat define i16 @convert_to_bitmask16(<16 x i8> %vec) { ; Bits used in mask -; SDAG-LABEL: convert_to_bitmask16: -; SDAG: ; %bb.0: -; SDAG-NEXT: adrp x8, lCPI0_0@PAGE -; SDAG-NEXT: cmeq.16b v0, v0, #0 -; SDAG-NEXT: ldr q1, [x8, lCPI0_0@PAGEOFF] -; SDAG-NEXT: bic.16b v0, v1, v0 -; SDAG-NEXT: ext.16b v1, v0, v0, #8 -; SDAG-NEXT: zip1.16b v0, v0, v1 -; SDAG-NEXT: addv.8h h0, v0 -; SDAG-NEXT: fmov w0, s0 -; SDAG-NEXT: ret +; CHECK-SD-LABEL: convert_to_bitmask16: +; CHECK-SD: ; %bb.0: +; CHECK-SD-NEXT: adrp x8, lCPI0_0@PAGE +; CHECK-SD-NEXT: cmeq.16b v0, v0, #0 +; CHECK-SD-NEXT: ldr q1, [x8, lCPI0_0@PAGEOFF] +; CHECK-SD-NEXT: bic.16b v0, v1, v0 +; CHECK-SD-NEXT: ext.16b v1, v0, v0, #8 +; CHECK-SD-NEXT: zip1.16b v0, v0, v1 +; CHECK-SD-NEXT: addv.8h h0, v0 +; CHECK-SD-NEXT: fmov w0, s0 +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: convert_to_bitmask16: -; GISEL: ; %bb.0: -; GISEL-NEXT: sub sp, sp, #16 -; GISEL-NEXT: .cfi_def_cfa_offset 16 -; GISEL-NEXT: cmeq.16b v0, v0, #0 -; GISEL-NEXT: mvn.16b v0, v0 -; GISEL-NEXT: umov.b w8, v0[1] -; GISEL-NEXT: umov.b w9, v0[0] -; GISEL-NEXT: umov.b w10, v0[2] -; GISEL-NEXT: umov.b w11, v0[3] -; GISEL-NEXT: and w8, w8, #0x1 -; GISEL-NEXT: bfi w9, w8, #1, #31 -; GISEL-NEXT: and w8, w10, #0x1 -; GISEL-NEXT: umov.b w10, v0[4] -; GISEL-NEXT: orr w8, w9, w8, lsl #2 -; GISEL-NEXT: and w9, 
w11, #0x1 -; GISEL-NEXT: umov.b w11, v0[5] -; GISEL-NEXT: orr w8, w8, w9, lsl #3 -; GISEL-NEXT: and w9, w10, #0x1 -; GISEL-NEXT: umov.b w10, v0[6] -; GISEL-NEXT: orr w8, w8, w9, lsl #4 -; GISEL-NEXT: and w9, w11, #0x1 -; GISEL-NEXT: umov.b w11, v0[7] -; GISEL-NEXT: orr w8, w8, w9, lsl #5 -; GISEL-NEXT: and w9, w10, #0x1 -; GISEL-NEXT: umov.b w10, v0[8] -; GISEL-NEXT: orr w8, w8, w9, lsl #6 -; GISEL-NEXT: and w9, w11, #0x1 -; GISEL-NEXT: umov.b w11, v0[9] -; GISEL-NEXT: orr w8, w8, w9, lsl #7 -; GISEL-NEXT: and w9, w10, #0x1 -; GISEL-NEXT: umov.b w10, v0[10] -; GISEL-NEXT: orr w8, w8, w9, lsl #8 -; GISEL-NEXT: and w9, w11, #0x1 -; GISEL-NEXT: umov.b w11, v0[11] -; GISEL-NEXT: orr w8, w8, w9, lsl #9 -; GISEL-NEXT: and w9, w10, #0x1 -; GISEL-NEXT: umov.b w10, v0[12] -; GISEL-NEXT: orr w8, w8, w9, lsl #10 -; GISEL-NEXT: and w9, w11, #0x1 -; GISEL-NEXT: umov.b w11, v0[13] -; GISEL-NEXT: orr w8, w8, w9, lsl #11 -; GISEL-NEXT: and w9, w10, #0x1 -; GISEL-NEXT: umov.b w10, v0[14] -; GISEL-NEXT: orr w8, w8, w9, lsl #12 -; GISEL-NEXT: and w9, w11, #0x1 -; GISEL-NEXT: umov.b w11, v0[15] -; GISEL-NEXT: orr w8, w8, w9, lsl #13 -; GISEL-NEXT: and w9, w10, #0x1 -; GISEL-NEXT: orr w8, w8, w9, lsl #14 -; GISEL-NEXT: and w9, w11, #0x1 -; GISEL-NEXT: orr w8, w8, w9, lsl #15 -; GISEL-NEXT: strh w8, [sp, #14] -; GISEL-NEXT: and w0, w8, #0xffff -; GISEL-NEXT: add sp, sp, #16 -; GISEL-NEXT: ret +; CHECK-GI-LABEL: convert_to_bitmask16: +; CHECK-GI: ; %bb.0: +; CHECK-GI-NEXT: sub sp, sp, #16 +; CHECK-GI-NEXT: .cfi_def_cfa_offset 16 +; CHECK-GI-NEXT: cmeq.16b v0, v0, #0 +; CHECK-GI-NEXT: mvn.16b v0, v0 +; CHECK-GI-NEXT: umov.b w8, v0[1] +; CHECK-GI-NEXT: umov.b w9, v0[0] +; CHECK-GI-NEXT: umov.b w10, v0[2] +; CHECK-GI-NEXT: umov.b w11, v0[3] +; CHECK-GI-NEXT: and w8, w8, #0x1 +; CHECK-GI-NEXT: bfi w9, w8, #1, #31 +; CHECK-GI-NEXT: and w8, w10, #0x1 +; CHECK-GI-NEXT: umov.b w10, v0[4] +; CHECK-GI-NEXT: orr w8, w9, w8, lsl #2 +; CHECK-GI-NEXT: and w9, w11, #0x1 +; CHECK-GI-NEXT: umov.b w11, v0[5] +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #3 +; CHECK-GI-NEXT: and w9, w10, #0x1 +; CHECK-GI-NEXT: umov.b w10, v0[6] +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #4 +; CHECK-GI-NEXT: and w9, w11, #0x1 +; CHECK-GI-NEXT: umov.b w11, v0[7] +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #5 +; CHECK-GI-NEXT: and w9, w10, #0x1 +; CHECK-GI-NEXT: umov.b w10, v0[8] +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #6 +; CHECK-GI-NEXT: and w9, w11, #0x1 +; CHECK-GI-NEXT: umov.b w11, v0[9] +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #7 +; CHECK-GI-NEXT: and w9, w10, #0x1 +; CHECK-GI-NEXT: umov.b w10, v0[10] +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #8 +; CHECK-GI-NEXT: and w9, w11, #0x1 +; CHECK-GI-NEXT: umov.b w11, v0[11] +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #9 +; CHECK-GI-NEXT: and w9, w10, #0x1 +; CHECK-GI-NEXT: umov.b w10, v0[12] +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #10 +; CHECK-GI-NEXT: and w9, w11, #0x1 +; CHECK-GI-NEXT: umov.b w11, v0[13] +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #11 +; CHECK-GI-NEXT: and w9, w10, #0x1 +; CHECK-GI-NEXT: umov.b w10, v0[14] +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #12 +; CHECK-GI-NEXT: and w9, w11, #0x1 +; CHECK-GI-NEXT: umov.b w11, v0[15] +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #13 +; CHECK-GI-NEXT: and w9, w10, #0x1 +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #14 +; CHECK-GI-NEXT: and w9, w11, #0x1 +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #15 +; CHECK-GI-NEXT: strh w8, [sp, #14] +; CHECK-GI-NEXT: and w0, w8, #0xffff +; CHECK-GI-NEXT: add sp, sp, #16 +; CHECK-GI-NEXT: ret ; Actual conversion @@ -90,50 +91,50 @@ define i16 @convert_to_bitmask16(<16 x i8> 
%vec) { } define i16 @convert_to_bitmask8(<8 x i16> %vec) { -; SDAG-LABEL: convert_to_bitmask8: -; SDAG: ; %bb.0: -; SDAG-NEXT: adrp x8, lCPI1_0@PAGE -; SDAG-NEXT: cmeq.8h v0, v0, #0 -; SDAG-NEXT: ldr q1, [x8, lCPI1_0@PAGEOFF] -; SDAG-NEXT: bic.16b v0, v1, v0 -; SDAG-NEXT: addv.8h h0, v0 -; SDAG-NEXT: fmov w8, s0 -; SDAG-NEXT: and w0, w8, #0xff -; SDAG-NEXT: ret +; CHECK-SD-LABEL: convert_to_bitmask8: +; CHECK-SD: ; %bb.0: +; CHECK-SD-NEXT: adrp x8, lCPI1_0@PAGE +; CHECK-SD-NEXT: cmeq.8h v0, v0, #0 +; CHECK-SD-NEXT: ldr q1, [x8, lCPI1_0@PAGEOFF] +; CHECK-SD-NEXT: bic.16b v0, v1, v0 +; CHECK-SD-NEXT: addv.8h h0, v0 +; CHECK-SD-NEXT: fmov w8, s0 +; CHECK-SD-NEXT: and w0, w8, #0xff +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: convert_to_bitmask8: -; GISEL: ; %bb.0: -; GISEL-NEXT: sub sp, sp, #16 -; GISEL-NEXT: .cfi_def_cfa_offset 16 -; GISEL-NEXT: cmeq.8h v0, v0, #0 -; GISEL-NEXT: mvn.16b v0, v0 -; GISEL-NEXT: xtn.8b v0, v0 -; GISEL-NEXT: umov.b w8, v0[1] -; GISEL-NEXT: umov.b w9, v0[0] -; GISEL-NEXT: umov.b w10, v0[2] -; GISEL-NEXT: umov.b w11, v0[3] -; GISEL-NEXT: and w8, w8, #0x1 -; GISEL-NEXT: bfi w9, w8, #1, #31 -; GISEL-NEXT: and w8, w10, #0x1 -; GISEL-NEXT: umov.b w10, v0[4] -; GISEL-NEXT: orr w8, w9, w8, lsl #2 -; GISEL-NEXT: and w9, w11, #0x1 -; GISEL-NEXT: umov.b w11, v0[5] -; GISEL-NEXT: orr w8, w8, w9, lsl #3 -; GISEL-NEXT: and w9, w10, #0x1 -; GISEL-NEXT: umov.b w10, v0[6] -; GISEL-NEXT: orr w8, w8, w9, lsl #4 -; GISEL-NEXT: and w9, w11, #0x1 -; GISEL-NEXT: umov.b w11, v0[7] -; GISEL-NEXT: orr w8, w8, w9, lsl #5 -; GISEL-NEXT: and w9, w10, #0x1 -; GISEL-NEXT: orr w8, w8, w9, lsl #6 -; GISEL-NEXT: and w9, w11, #0x1 -; GISEL-NEXT: orr w8, w8, w9, lsl #7 -; GISEL-NEXT: strb w8, [sp, #15] -; GISEL-NEXT: and w0, w8, #0xff -; GISEL-NEXT: add sp, sp, #16 -; GISEL-NEXT: ret +; CHECK-GI-LABEL: convert_to_bitmask8: +; CHECK-GI: ; %bb.0: +; CHECK-GI-NEXT: sub sp, sp, #16 +; CHECK-GI-NEXT: .cfi_def_cfa_offset 16 +; CHECK-GI-NEXT: cmeq.8h v0, v0, #0 +; CHECK-GI-NEXT: mvn.16b v0, v0 +; CHECK-GI-NEXT: xtn.8b v0, v0 +; CHECK-GI-NEXT: umov.b w8, v0[1] +; CHECK-GI-NEXT: umov.b w9, v0[0] +; CHECK-GI-NEXT: umov.b w10, v0[2] +; CHECK-GI-NEXT: umov.b w11, v0[3] +; CHECK-GI-NEXT: and w8, w8, #0x1 +; CHECK-GI-NEXT: bfi w9, w8, #1, #31 +; CHECK-GI-NEXT: and w8, w10, #0x1 +; CHECK-GI-NEXT: umov.b w10, v0[4] +; CHECK-GI-NEXT: orr w8, w9, w8, lsl #2 +; CHECK-GI-NEXT: and w9, w11, #0x1 +; CHECK-GI-NEXT: umov.b w11, v0[5] +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #3 +; CHECK-GI-NEXT: and w9, w10, #0x1 +; CHECK-GI-NEXT: umov.b w10, v0[6] +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #4 +; CHECK-GI-NEXT: and w9, w11, #0x1 +; CHECK-GI-NEXT: umov.b w11, v0[7] +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #5 +; CHECK-GI-NEXT: and w9, w10, #0x1 +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #6 +; CHECK-GI-NEXT: and w9, w11, #0x1 +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #7 +; CHECK-GI-NEXT: strb w8, [sp, #15] +; CHECK-GI-NEXT: and w0, w8, #0xff +; CHECK-GI-NEXT: add sp, sp, #16 +; CHECK-GI-NEXT: ret %cmp_result = icmp ne <8 x i16> %vec, zeroinitializer @@ -143,36 +144,36 @@ define i16 @convert_to_bitmask8(<8 x i16> %vec) { } define i4 @convert_to_bitmask4(<4 x i32> %vec) { -; SDAG-LABEL: convert_to_bitmask4: -; SDAG: ; %bb.0: -; SDAG-NEXT: adrp x8, lCPI2_0@PAGE -; SDAG-NEXT: cmeq.4s v0, v0, #0 -; SDAG-NEXT: ldr q1, [x8, lCPI2_0@PAGEOFF] -; SDAG-NEXT: bic.16b v0, v1, v0 -; SDAG-NEXT: addv.4s s0, v0 -; SDAG-NEXT: fmov w0, s0 -; SDAG-NEXT: ret +; CHECK-SD-LABEL: convert_to_bitmask4: +; CHECK-SD: ; %bb.0: +; CHECK-SD-NEXT: adrp x8, lCPI2_0@PAGE +; 
CHECK-SD-NEXT: cmeq.4s v0, v0, #0 +; CHECK-SD-NEXT: ldr q1, [x8, lCPI2_0@PAGEOFF] +; CHECK-SD-NEXT: bic.16b v0, v1, v0 +; CHECK-SD-NEXT: addv.4s s0, v0 +; CHECK-SD-NEXT: fmov w0, s0 +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: convert_to_bitmask4: -; GISEL: ; %bb.0: -; GISEL-NEXT: sub sp, sp, #16 -; GISEL-NEXT: .cfi_def_cfa_offset 16 -; GISEL-NEXT: cmeq.4s v0, v0, #0 -; GISEL-NEXT: mvn.16b v0, v0 -; GISEL-NEXT: mov.s w8, v0[1] -; GISEL-NEXT: mov.s w9, v0[2] -; GISEL-NEXT: fmov w11, s0 -; GISEL-NEXT: mov.s w10, v0[3] -; GISEL-NEXT: and w8, w8, #0x1 -; GISEL-NEXT: bfi w11, w8, #1, #31 -; GISEL-NEXT: and w8, w9, #0x1 -; GISEL-NEXT: and w9, w10, #0x1 -; GISEL-NEXT: orr w8, w11, w8, lsl #2 -; GISEL-NEXT: orr w8, w8, w9, lsl #3 -; GISEL-NEXT: strb w8, [sp, #15] -; GISEL-NEXT: and w0, w8, #0xff -; GISEL-NEXT: add sp, sp, #16 -; GISEL-NEXT: ret +; CHECK-GI-LABEL: convert_to_bitmask4: +; CHECK-GI: ; %bb.0: +; CHECK-GI-NEXT: sub sp, sp, #16 +; CHECK-GI-NEXT: .cfi_def_cfa_offset 16 +; CHECK-GI-NEXT: cmeq.4s v0, v0, #0 +; CHECK-GI-NEXT: mvn.16b v0, v0 +; CHECK-GI-NEXT: mov.s w8, v0[1] +; CHECK-GI-NEXT: mov.s w9, v0[2] +; CHECK-GI-NEXT: fmov w11, s0 +; CHECK-GI-NEXT: mov.s w10, v0[3] +; CHECK-GI-NEXT: and w8, w8, #0x1 +; CHECK-GI-NEXT: bfi w11, w8, #1, #31 +; CHECK-GI-NEXT: and w8, w9, #0x1 +; CHECK-GI-NEXT: and w9, w10, #0x1 +; CHECK-GI-NEXT: orr w8, w11, w8, lsl #2 +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #3 +; CHECK-GI-NEXT: strb w8, [sp, #15] +; CHECK-GI-NEXT: and w0, w8, #0xff +; CHECK-GI-NEXT: add sp, sp, #16 +; CHECK-GI-NEXT: ret %cmp_result = icmp ne <4 x i32> %vec, zeroinitializer @@ -220,37 +221,37 @@ define i8 @clang_builtins_undef_concat_convert_to_bitmask4(<4 x i32> %vec) { define i4 @convert_to_bitmask_no_compare(<4 x i32> %vec1, <4 x i32> %vec2) { -; SDAG-LABEL: convert_to_bitmask_no_compare: -; SDAG: ; %bb.0: -; SDAG-NEXT: and.16b v0, v0, v1 -; SDAG-NEXT: adrp x8, lCPI5_0@PAGE -; SDAG-NEXT: ldr q1, [x8, lCPI5_0@PAGEOFF] -; SDAG-NEXT: shl.4s v0, v0, #31 -; SDAG-NEXT: cmlt.4s v0, v0, #0 -; SDAG-NEXT: and.16b v0, v0, v1 -; SDAG-NEXT: addv.4s s0, v0 -; SDAG-NEXT: fmov w0, s0 -; SDAG-NEXT: ret +; CHECK-SD-LABEL: convert_to_bitmask_no_compare: +; CHECK-SD: ; %bb.0: +; CHECK-SD-NEXT: and.16b v0, v0, v1 +; CHECK-SD-NEXT: adrp x8, lCPI5_0@PAGE +; CHECK-SD-NEXT: ldr q1, [x8, lCPI5_0@PAGEOFF] +; CHECK-SD-NEXT: shl.4s v0, v0, #31 +; CHECK-SD-NEXT: cmlt.4s v0, v0, #0 +; CHECK-SD-NEXT: and.16b v0, v0, v1 +; CHECK-SD-NEXT: addv.4s s0, v0 +; CHECK-SD-NEXT: fmov w0, s0 +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: convert_to_bitmask_no_compare: -; GISEL: ; %bb.0: -; GISEL-NEXT: sub sp, sp, #16 -; GISEL-NEXT: .cfi_def_cfa_offset 16 -; GISEL-NEXT: and.16b v0, v0, v1 -; GISEL-NEXT: mov.s w8, v0[1] -; GISEL-NEXT: mov.s w9, v0[2] -; GISEL-NEXT: fmov w11, s0 -; GISEL-NEXT: mov.s w10, v0[3] -; GISEL-NEXT: and w8, w8, #0x1 -; GISEL-NEXT: bfi w11, w8, #1, #31 -; GISEL-NEXT: and w8, w9, #0x1 -; GISEL-NEXT: and w9, w10, #0x1 -; GISEL-NEXT: orr w8, w11, w8, lsl #2 -; GISEL-NEXT: orr w8, w8, w9, lsl #3 -; GISEL-NEXT: strb w8, [sp, #15] -; GISEL-NEXT: and w0, w8, #0xff -; GISEL-NEXT: add sp, sp, #16 -; GISEL-NEXT: ret +; CHECK-GI-LABEL: convert_to_bitmask_no_compare: +; CHECK-GI: ; %bb.0: +; CHECK-GI-NEXT: sub sp, sp, #16 +; CHECK-GI-NEXT: .cfi_def_cfa_offset 16 +; CHECK-GI-NEXT: and.16b v0, v0, v1 +; CHECK-GI-NEXT: mov.s w8, v0[1] +; CHECK-GI-NEXT: mov.s w9, v0[2] +; CHECK-GI-NEXT: fmov w11, s0 +; CHECK-GI-NEXT: mov.s w10, v0[3] +; CHECK-GI-NEXT: and w8, w8, #0x1 +; CHECK-GI-NEXT: bfi w11, w8, #1, #31 +; CHECK-GI-NEXT: and w8, 
w9, #0x1 +; CHECK-GI-NEXT: and w9, w10, #0x1 +; CHECK-GI-NEXT: orr w8, w11, w8, lsl #2 +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #3 +; CHECK-GI-NEXT: strb w8, [sp, #15] +; CHECK-GI-NEXT: and w0, w8, #0xff +; CHECK-GI-NEXT: add sp, sp, #16 +; CHECK-GI-NEXT: ret %cmp = and <4 x i32> %vec1, %vec2 @@ -260,39 +261,39 @@ define i4 @convert_to_bitmask_no_compare(<4 x i32> %vec1, <4 x i32> %vec2) { } define i4 @convert_to_bitmask_with_compare_chain(<4 x i32> %vec1, <4 x i32> %vec2) { -; SDAG-LABEL: convert_to_bitmask_with_compare_chain: -; SDAG: ; %bb.0: -; SDAG-NEXT: cmeq.4s v2, v0, #0 -; SDAG-NEXT: cmeq.4s v0, v0, v1 -; SDAG-NEXT: adrp x8, lCPI6_0@PAGE -; SDAG-NEXT: ldr q1, [x8, lCPI6_0@PAGEOFF] -; SDAG-NEXT: bic.16b v0, v0, v2 -; SDAG-NEXT: and.16b v0, v0, v1 -; SDAG-NEXT: addv.4s s0, v0 -; SDAG-NEXT: fmov w0, s0 -; SDAG-NEXT: ret +; CHECK-SD-LABEL: convert_to_bitmask_with_compare_chain: +; CHECK-SD: ; %bb.0: +; CHECK-SD-NEXT: cmeq.4s v2, v0, #0 +; CHECK-SD-NEXT: cmeq.4s v0, v0, v1 +; CHECK-SD-NEXT: adrp x8, lCPI6_0@PAGE +; CHECK-SD-NEXT: ldr q1, [x8, lCPI6_0@PAGEOFF] +; CHECK-SD-NEXT: bic.16b v0, v0, v2 +; CHECK-SD-NEXT: and.16b v0, v0, v1 +; CHECK-SD-NEXT: addv.4s s0, v0 +; CHECK-SD-NEXT: fmov w0, s0 +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: convert_to_bitmask_with_compare_chain: -; GISEL: ; %bb.0: -; GISEL-NEXT: sub sp, sp, #16 -; GISEL-NEXT: .cfi_def_cfa_offset 16 -; GISEL-NEXT: cmeq.4s v2, v0, #0 -; GISEL-NEXT: cmeq.4s v0, v0, v1 -; GISEL-NEXT: bic.16b v0, v0, v2 -; GISEL-NEXT: mov.s w8, v0[1] -; GISEL-NEXT: mov.s w9, v0[2] -; GISEL-NEXT: fmov w11, s0 -; GISEL-NEXT: mov.s w10, v0[3] -; GISEL-NEXT: and w8, w8, #0x1 -; GISEL-NEXT: bfi w11, w8, #1, #31 -; GISEL-NEXT: and w8, w9, #0x1 -; GISEL-NEXT: and w9, w10, #0x1 -; GISEL-NEXT: orr w8, w11, w8, lsl #2 -; GISEL-NEXT: orr w8, w8, w9, lsl #3 -; GISEL-NEXT: strb w8, [sp, #15] -; GISEL-NEXT: and w0, w8, #0xff -; GISEL-NEXT: add sp, sp, #16 -; GISEL-NEXT: ret +; CHECK-GI-LABEL: convert_to_bitmask_with_compare_chain: +; CHECK-GI: ; %bb.0: +; CHECK-GI-NEXT: sub sp, sp, #16 +; CHECK-GI-NEXT: .cfi_def_cfa_offset 16 +; CHECK-GI-NEXT: cmeq.4s v2, v0, #0 +; CHECK-GI-NEXT: cmeq.4s v0, v0, v1 +; CHECK-GI-NEXT: bic.16b v0, v0, v2 +; CHECK-GI-NEXT: mov.s w8, v0[1] +; CHECK-GI-NEXT: mov.s w9, v0[2] +; CHECK-GI-NEXT: fmov w11, s0 +; CHECK-GI-NEXT: mov.s w10, v0[3] +; CHECK-GI-NEXT: and w8, w8, #0x1 +; CHECK-GI-NEXT: bfi w11, w8, #1, #31 +; CHECK-GI-NEXT: and w8, w9, #0x1 +; CHECK-GI-NEXT: and w9, w10, #0x1 +; CHECK-GI-NEXT: orr w8, w11, w8, lsl #2 +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #3 +; CHECK-GI-NEXT: strb w8, [sp, #15] +; CHECK-GI-NEXT: and w0, w8, #0xff +; CHECK-GI-NEXT: add sp, sp, #16 +; CHECK-GI-NEXT: ret %cmp1 = icmp ne <4 x i32> %vec1, zeroinitializer @@ -303,39 +304,39 @@ define i4 @convert_to_bitmask_with_compare_chain(<4 x i32> %vec1, <4 x i32> %vec } define i4 @convert_to_bitmask_with_trunc_in_chain(<4 x i32> %vec1, <4 x i32> %vec2) { -; SDAG-LABEL: convert_to_bitmask_with_trunc_in_chain: -; SDAG: ; %bb.0: -; SDAG-NEXT: cmeq.4s v0, v0, #0 -; SDAG-NEXT: adrp x8, lCPI7_0@PAGE -; SDAG-NEXT: bic.16b v0, v1, v0 -; SDAG-NEXT: ldr q1, [x8, lCPI7_0@PAGEOFF] -; SDAG-NEXT: shl.4s v0, v0, #31 -; SDAG-NEXT: cmlt.4s v0, v0, #0 -; SDAG-NEXT: and.16b v0, v0, v1 -; SDAG-NEXT: addv.4s s0, v0 -; SDAG-NEXT: fmov w0, s0 -; SDAG-NEXT: ret +; CHECK-SD-LABEL: convert_to_bitmask_with_trunc_in_chain: +; CHECK-SD: ; %bb.0: +; CHECK-SD-NEXT: cmeq.4s v0, v0, #0 +; CHECK-SD-NEXT: adrp x8, lCPI7_0@PAGE +; CHECK-SD-NEXT: bic.16b v0, v1, v0 +; CHECK-SD-NEXT: ldr q1, [x8, 
lCPI7_0@PAGEOFF] +; CHECK-SD-NEXT: shl.4s v0, v0, #31 +; CHECK-SD-NEXT: cmlt.4s v0, v0, #0 +; CHECK-SD-NEXT: and.16b v0, v0, v1 +; CHECK-SD-NEXT: addv.4s s0, v0 +; CHECK-SD-NEXT: fmov w0, s0 +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: convert_to_bitmask_with_trunc_in_chain: -; GISEL: ; %bb.0: -; GISEL-NEXT: sub sp, sp, #16 -; GISEL-NEXT: .cfi_def_cfa_offset 16 -; GISEL-NEXT: cmeq.4s v0, v0, #0 -; GISEL-NEXT: bic.16b v0, v1, v0 -; GISEL-NEXT: mov.s w8, v0[1] -; GISEL-NEXT: mov.s w9, v0[2] -; GISEL-NEXT: fmov w11, s0 -; GISEL-NEXT: mov.s w10, v0[3] -; GISEL-NEXT: and w8, w8, #0x1 -; GISEL-NEXT: bfi w11, w8, #1, #31 -; GISEL-NEXT: and w8, w9, #0x1 -; GISEL-NEXT: and w9, w10, #0x1 -; GISEL-NEXT: orr w8, w11, w8, lsl #2 -; GISEL-NEXT: orr w8, w8, w9, lsl #3 -; GISEL-NEXT: strb w8, [sp, #15] -; GISEL-NEXT: and w0, w8, #0xff -; GISEL-NEXT: add sp, sp, #16 -; GISEL-NEXT: ret +; CHECK-GI-LABEL: convert_to_bitmask_with_trunc_in_chain: +; CHECK-GI: ; %bb.0: +; CHECK-GI-NEXT: sub sp, sp, #16 +; CHECK-GI-NEXT: .cfi_def_cfa_offset 16 +; CHECK-GI-NEXT: cmeq.4s v0, v0, #0 +; CHECK-GI-NEXT: bic.16b v0, v1, v0 +; CHECK-GI-NEXT: mov.s w8, v0[1] +; CHECK-GI-NEXT: mov.s w9, v0[2] +; CHECK-GI-NEXT: fmov w11, s0 +; CHECK-GI-NEXT: mov.s w10, v0[3] +; CHECK-GI-NEXT: and w8, w8, #0x1 +; CHECK-GI-NEXT: bfi w11, w8, #1, #31 +; CHECK-GI-NEXT: and w8, w9, #0x1 +; CHECK-GI-NEXT: and w9, w10, #0x1 +; CHECK-GI-NEXT: orr w8, w11, w8, lsl #2 +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #3 +; CHECK-GI-NEXT: strb w8, [sp, #15] +; CHECK-GI-NEXT: and w0, w8, #0xff +; CHECK-GI-NEXT: add sp, sp, #16 +; CHECK-GI-NEXT: ret %cmp1 = icmp ne <4 x i32> %vec1, zeroinitializer @@ -346,82 +347,82 @@ define i4 @convert_to_bitmask_with_trunc_in_chain(<4 x i32> %vec1, <4 x i32> %ve } define i4 @convert_to_bitmask_with_unknown_type_in_long_chain(<4 x i32> %vec1, <4 x i32> %vec2) { -; SDAG-LABEL: convert_to_bitmask_with_unknown_type_in_long_chain: -; SDAG: ; %bb.0: -; SDAG-NEXT: cmeq.4s v0, v0, #0 -; SDAG-NEXT: cmeq.4s v1, v1, #0 -; SDAG-NEXT: adrp x8, lCPI8_0@PAGE -; SDAG-NEXT: movi d2, #0x000000ffffffff -; SDAG-NEXT: movi d3, #0x00ffffffffffff -; SDAG-NEXT: bic.16b v0, v1, v0 -; SDAG-NEXT: movi d1, #0xffff0000ffff0000 -; SDAG-NEXT: xtn.4h v0, v0 -; SDAG-NEXT: orr.8b v0, v0, v2 -; SDAG-NEXT: movi d2, #0x00ffffffff0000 -; SDAG-NEXT: eor.8b v1, v0, v1 -; SDAG-NEXT: eor.8b v0, v0, v2 -; SDAG-NEXT: mov.h v1[2], wzr -; SDAG-NEXT: orr.8b v0, v0, v3 -; SDAG-NEXT: orr.8b v0, v1, v0 -; SDAG-NEXT: ldr d1, [x8, lCPI8_0@PAGEOFF] -; SDAG-NEXT: shl.4h v0, v0, #15 -; SDAG-NEXT: cmlt.4h v0, v0, #0 -; SDAG-NEXT: and.8b v0, v0, v1 -; SDAG-NEXT: addv.4h h0, v0 -; SDAG-NEXT: fmov w0, s0 -; SDAG-NEXT: ret +; CHECK-SD-LABEL: convert_to_bitmask_with_unknown_type_in_long_chain: +; CHECK-SD: ; %bb.0: +; CHECK-SD-NEXT: cmeq.4s v0, v0, #0 +; CHECK-SD-NEXT: cmeq.4s v1, v1, #0 +; CHECK-SD-NEXT: adrp x8, lCPI8_0@PAGE +; CHECK-SD-NEXT: movi d2, #0x000000ffffffff +; CHECK-SD-NEXT: movi d3, #0x00ffffffffffff +; CHECK-SD-NEXT: bic.16b v0, v1, v0 +; CHECK-SD-NEXT: movi d1, #0xffff0000ffff0000 +; CHECK-SD-NEXT: xtn.4h v0, v0 +; CHECK-SD-NEXT: orr.8b v0, v0, v2 +; CHECK-SD-NEXT: movi d2, #0x00ffffffff0000 +; CHECK-SD-NEXT: eor.8b v1, v0, v1 +; CHECK-SD-NEXT: eor.8b v0, v0, v2 +; CHECK-SD-NEXT: mov.h v1[2], wzr +; CHECK-SD-NEXT: orr.8b v0, v0, v3 +; CHECK-SD-NEXT: orr.8b v0, v1, v0 +; CHECK-SD-NEXT: ldr d1, [x8, lCPI8_0@PAGEOFF] +; CHECK-SD-NEXT: shl.4h v0, v0, #15 +; CHECK-SD-NEXT: cmlt.4h v0, v0, #0 +; CHECK-SD-NEXT: and.8b v0, v0, v1 +; CHECK-SD-NEXT: addv.4h h0, v0 +; CHECK-SD-NEXT: 
fmov w0, s0 +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: convert_to_bitmask_with_unknown_type_in_long_chain: -; GISEL: ; %bb.0: -; GISEL-NEXT: sub sp, sp, #16 -; GISEL-NEXT: .cfi_def_cfa_offset 16 -; GISEL-NEXT: mov w8, #1 ; =0x1 -; GISEL-NEXT: mov w9, #0 ; =0x0 -; GISEL-NEXT: cmeq.4s v5, v0, #0 -; GISEL-NEXT: fmov s2, w8 -; GISEL-NEXT: fmov s4, w9 -; GISEL-NEXT: cmeq.4s v1, v1, #0 -; GISEL-NEXT: mov.16b v3, v2 -; GISEL-NEXT: mov.16b v0, v4 -; GISEL-NEXT: mov.h v4[1], w8 -; GISEL-NEXT: bic.16b v1, v1, v5 -; GISEL-NEXT: mov.16b v5, v2 -; GISEL-NEXT: mov.h v2[1], w8 -; GISEL-NEXT: mov.h v3[1], w8 -; GISEL-NEXT: mov.h v0[1], w8 -; GISEL-NEXT: mov.h v5[1], w8 -; GISEL-NEXT: mov.h v4[2], w8 -; GISEL-NEXT: xtn.4h v1, v1 -; GISEL-NEXT: mov.h v2[2], w8 -; GISEL-NEXT: mov.h v3[2], w9 -; GISEL-NEXT: mov.h v0[2], w9 -; GISEL-NEXT: mov.h v5[2], w9 -; GISEL-NEXT: mov.h v4[3], w9 -; GISEL-NEXT: mov.h v2[3], w9 -; GISEL-NEXT: mov.h v3[3], w9 -; GISEL-NEXT: mov.h v0[3], w8 -; GISEL-NEXT: mov.h v5[3], w8 -; GISEL-NEXT: orr.8b v1, v1, v3 -; GISEL-NEXT: eor.8b v0, v1, v0 -; GISEL-NEXT: eor.8b v1, v4, v1 -; GISEL-NEXT: and.8b v0, v0, v5 -; GISEL-NEXT: orr.8b v1, v2, v1 -; GISEL-NEXT: orr.8b v0, v0, v1 -; GISEL-NEXT: ushll.4s v0, v0, #0 -; GISEL-NEXT: mov.s w8, v0[1] -; GISEL-NEXT: mov.s w9, v0[2] -; GISEL-NEXT: fmov w11, s0 -; GISEL-NEXT: mov.s w10, v0[3] -; GISEL-NEXT: and w8, w8, #0x1 -; GISEL-NEXT: bfi w11, w8, #1, #31 -; GISEL-NEXT: and w8, w9, #0x1 -; GISEL-NEXT: and w9, w10, #0x1 -; GISEL-NEXT: orr w8, w11, w8, lsl #2 -; GISEL-NEXT: orr w8, w8, w9, lsl #3 -; GISEL-NEXT: strb w8, [sp, #15] -; GISEL-NEXT: and w0, w8, #0xff -; GISEL-NEXT: add sp, sp, #16 -; GISEL-NEXT: ret +; CHECK-GI-LABEL: convert_to_bitmask_with_unknown_type_in_long_chain: +; CHECK-GI: ; %bb.0: +; CHECK-GI-NEXT: sub sp, sp, #16 +; CHECK-GI-NEXT: .cfi_def_cfa_offset 16 +; CHECK-GI-NEXT: mov w8, #1 ; =0x1 +; CHECK-GI-NEXT: mov w9, #0 ; =0x0 +; CHECK-GI-NEXT: cmeq.4s v5, v0, #0 +; CHECK-GI-NEXT: fmov s2, w8 +; CHECK-GI-NEXT: fmov s4, w9 +; CHECK-GI-NEXT: cmeq.4s v1, v1, #0 +; CHECK-GI-NEXT: mov.16b v3, v2 +; CHECK-GI-NEXT: mov.16b v0, v4 +; CHECK-GI-NEXT: mov.h v4[1], w8 +; CHECK-GI-NEXT: bic.16b v1, v1, v5 +; CHECK-GI-NEXT: mov.16b v5, v2 +; CHECK-GI-NEXT: mov.h v2[1], w8 +; CHECK-GI-NEXT: mov.h v3[1], w8 +; CHECK-GI-NEXT: mov.h v0[1], w8 +; CHECK-GI-NEXT: mov.h v5[1], w8 +; CHECK-GI-NEXT: mov.h v4[2], w8 +; CHECK-GI-NEXT: xtn.4h v1, v1 +; CHECK-GI-NEXT: mov.h v2[2], w8 +; CHECK-GI-NEXT: mov.h v3[2], w9 +; CHECK-GI-NEXT: mov.h v0[2], w9 +; CHECK-GI-NEXT: mov.h v5[2], w9 +; CHECK-GI-NEXT: mov.h v4[3], w9 +; CHECK-GI-NEXT: mov.h v2[3], w9 +; CHECK-GI-NEXT: mov.h v3[3], w9 +; CHECK-GI-NEXT: mov.h v0[3], w8 +; CHECK-GI-NEXT: mov.h v5[3], w8 +; CHECK-GI-NEXT: orr.8b v1, v1, v3 +; CHECK-GI-NEXT: eor.8b v0, v1, v0 +; CHECK-GI-NEXT: eor.8b v1, v4, v1 +; CHECK-GI-NEXT: and.8b v0, v0, v5 +; CHECK-GI-NEXT: orr.8b v1, v2, v1 +; CHECK-GI-NEXT: orr.8b v0, v0, v1 +; CHECK-GI-NEXT: ushll.4s v0, v0, #0 +; CHECK-GI-NEXT: mov.s w8, v0[1] +; CHECK-GI-NEXT: mov.s w9, v0[2] +; CHECK-GI-NEXT: fmov w11, s0 +; CHECK-GI-NEXT: mov.s w10, v0[3] +; CHECK-GI-NEXT: and w8, w8, #0x1 +; CHECK-GI-NEXT: bfi w11, w8, #1, #31 +; CHECK-GI-NEXT: and w8, w9, #0x1 +; CHECK-GI-NEXT: and w9, w10, #0x1 +; CHECK-GI-NEXT: orr w8, w11, w8, lsl #2 +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #3 +; CHECK-GI-NEXT: strb w8, [sp, #15] +; CHECK-GI-NEXT: and w0, w8, #0xff +; CHECK-GI-NEXT: add sp, sp, #16 +; CHECK-GI-NEXT: ret %cmp1 = icmp ne <4 x i32> %vec1, zeroinitializer @@ -440,42 +441,42 @@ define 
i4 @convert_to_bitmask_with_unknown_type_in_long_chain(<4 x i32> %vec1, < } define i4 @convert_to_bitmask_with_different_types_in_chain(<4 x i16> %vec1, <4 x i32> %vec2) { -; SDAG-LABEL: convert_to_bitmask_with_different_types_in_chain: -; SDAG: ; %bb.0: -; SDAG-NEXT: cmeq.4s v1, v1, #0 -; SDAG-NEXT: cmeq.4h v0, v0, #0 -; SDAG-NEXT: adrp x8, lCPI9_0@PAGE -; SDAG-NEXT: xtn.4h v1, v1 -; SDAG-NEXT: orn.8b v0, v1, v0 -; SDAG-NEXT: ldr d1, [x8, lCPI9_0@PAGEOFF] -; SDAG-NEXT: and.8b v0, v0, v1 -; SDAG-NEXT: addv.4h h0, v0 -; SDAG-NEXT: fmov w0, s0 -; SDAG-NEXT: ret +; CHECK-SD-LABEL: convert_to_bitmask_with_different_types_in_chain: +; CHECK-SD: ; %bb.0: +; CHECK-SD-NEXT: cmeq.4s v1, v1, #0 +; CHECK-SD-NEXT: cmeq.4h v0, v0, #0 +; CHECK-SD-NEXT: adrp x8, lCPI9_0@PAGE +; CHECK-SD-NEXT: xtn.4h v1, v1 +; CHECK-SD-NEXT: orn.8b v0, v1, v0 +; CHECK-SD-NEXT: ldr d1, [x8, lCPI9_0@PAGEOFF] +; CHECK-SD-NEXT: and.8b v0, v0, v1 +; CHECK-SD-NEXT: addv.4h h0, v0 +; CHECK-SD-NEXT: fmov w0, s0 +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: convert_to_bitmask_with_different_types_in_chain: -; GISEL: ; %bb.0: -; GISEL-NEXT: sub sp, sp, #16 -; GISEL-NEXT: .cfi_def_cfa_offset 16 -; GISEL-NEXT: cmeq.4s v1, v1, #0 -; GISEL-NEXT: cmeq.4h v0, v0, #0 -; GISEL-NEXT: xtn.4h v1, v1 -; GISEL-NEXT: orn.8b v0, v1, v0 -; GISEL-NEXT: ushll.4s v0, v0, #0 -; GISEL-NEXT: mov.s w8, v0[1] -; GISEL-NEXT: mov.s w9, v0[2] -; GISEL-NEXT: fmov w11, s0 -; GISEL-NEXT: mov.s w10, v0[3] -; GISEL-NEXT: and w8, w8, #0x1 -; GISEL-NEXT: bfi w11, w8, #1, #31 -; GISEL-NEXT: and w8, w9, #0x1 -; GISEL-NEXT: and w9, w10, #0x1 -; GISEL-NEXT: orr w8, w11, w8, lsl #2 -; GISEL-NEXT: orr w8, w8, w9, lsl #3 -; GISEL-NEXT: strb w8, [sp, #15] -; GISEL-NEXT: and w0, w8, #0xff -; GISEL-NEXT: add sp, sp, #16 -; GISEL-NEXT: ret +; CHECK-GI-LABEL: convert_to_bitmask_with_different_types_in_chain: +; CHECK-GI: ; %bb.0: +; CHECK-GI-NEXT: sub sp, sp, #16 +; CHECK-GI-NEXT: .cfi_def_cfa_offset 16 +; CHECK-GI-NEXT: cmeq.4s v1, v1, #0 +; CHECK-GI-NEXT: cmeq.4h v0, v0, #0 +; CHECK-GI-NEXT: xtn.4h v1, v1 +; CHECK-GI-NEXT: orn.8b v0, v1, v0 +; CHECK-GI-NEXT: ushll.4s v0, v0, #0 +; CHECK-GI-NEXT: mov.s w8, v0[1] +; CHECK-GI-NEXT: mov.s w9, v0[2] +; CHECK-GI-NEXT: fmov w11, s0 +; CHECK-GI-NEXT: mov.s w10, v0[3] +; CHECK-GI-NEXT: and w8, w8, #0x1 +; CHECK-GI-NEXT: bfi w11, w8, #1, #31 +; CHECK-GI-NEXT: and w8, w9, #0x1 +; CHECK-GI-NEXT: and w9, w10, #0x1 +; CHECK-GI-NEXT: orr w8, w11, w8, lsl #2 +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #3 +; CHECK-GI-NEXT: strb w8, [sp, #15] +; CHECK-GI-NEXT: and w0, w8, #0xff +; CHECK-GI-NEXT: add sp, sp, #16 +; CHECK-GI-NEXT: ret %cmp1 = icmp ne <4 x i16> %vec1, zeroinitializer @@ -486,73 +487,73 @@ define i4 @convert_to_bitmask_with_different_types_in_chain(<4 x i16> %vec1, <4 } define i16 @convert_to_bitmask_without_knowing_type(<16 x i1> %vec) { -; SDAG-LABEL: convert_to_bitmask_without_knowing_type: -; SDAG: ; %bb.0: -; SDAG-NEXT: shl.16b v0, v0, #7 -; SDAG-NEXT: adrp x8, lCPI10_0@PAGE -; SDAG-NEXT: ldr q1, [x8, lCPI10_0@PAGEOFF] -; SDAG-NEXT: cmlt.16b v0, v0, #0 -; SDAG-NEXT: and.16b v0, v0, v1 -; SDAG-NEXT: ext.16b v1, v0, v0, #8 -; SDAG-NEXT: zip1.16b v0, v0, v1 -; SDAG-NEXT: addv.8h h0, v0 -; SDAG-NEXT: fmov w0, s0 -; SDAG-NEXT: ret +; CHECK-SD-LABEL: convert_to_bitmask_without_knowing_type: +; CHECK-SD: ; %bb.0: +; CHECK-SD-NEXT: shl.16b v0, v0, #7 +; CHECK-SD-NEXT: adrp x8, lCPI10_0@PAGE +; CHECK-SD-NEXT: ldr q1, [x8, lCPI10_0@PAGEOFF] +; CHECK-SD-NEXT: cmlt.16b v0, v0, #0 +; CHECK-SD-NEXT: and.16b v0, v0, v1 +; CHECK-SD-NEXT: ext.16b v1, 
v0, v0, #8 +; CHECK-SD-NEXT: zip1.16b v0, v0, v1 +; CHECK-SD-NEXT: addv.8h h0, v0 +; CHECK-SD-NEXT: fmov w0, s0 +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: convert_to_bitmask_without_knowing_type: -; GISEL: ; %bb.0: -; GISEL-NEXT: sub sp, sp, #16 -; GISEL-NEXT: .cfi_def_cfa_offset 16 -; GISEL-NEXT: umov.b w8, v0[1] -; GISEL-NEXT: umov.b w9, v0[0] -; GISEL-NEXT: umov.b w10, v0[2] -; GISEL-NEXT: umov.b w11, v0[3] -; GISEL-NEXT: and w8, w8, #0x1 -; GISEL-NEXT: bfi w9, w8, #1, #31 -; GISEL-NEXT: and w8, w10, #0x1 -; GISEL-NEXT: umov.b w10, v0[4] -; GISEL-NEXT: orr w8, w9, w8, lsl #2 -; GISEL-NEXT: and w9, w11, #0x1 -; GISEL-NEXT: umov.b w11, v0[5] -; GISEL-NEXT: orr w8, w8, w9, lsl #3 -; GISEL-NEXT: and w9, w10, #0x1 -; GISEL-NEXT: umov.b w10, v0[6] -; GISEL-NEXT: orr w8, w8, w9, lsl #4 -; GISEL-NEXT: and w9, w11, #0x1 -; GISEL-NEXT: umov.b w11, v0[7] -; GISEL-NEXT: orr w8, w8, w9, lsl #5 -; GISEL-NEXT: and w9, w10, #0x1 -; GISEL-NEXT: umov.b w10, v0[8] -; GISEL-NEXT: orr w8, w8, w9, lsl #6 -; GISEL-NEXT: and w9, w11, #0x1 -; GISEL-NEXT: umov.b w11, v0[9] -; GISEL-NEXT: orr w8, w8, w9, lsl #7 -; GISEL-NEXT: and w9, w10, #0x1 -; GISEL-NEXT: umov.b w10, v0[10] -; GISEL-NEXT: orr w8, w8, w9, lsl #8 -; GISEL-NEXT: and w9, w11, #0x1 -; GISEL-NEXT: umov.b w11, v0[11] -; GISEL-NEXT: orr w8, w8, w9, lsl #9 -; GISEL-NEXT: and w9, w10, #0x1 -; GISEL-NEXT: umov.b w10, v0[12] -; GISEL-NEXT: orr w8, w8, w9, lsl #10 -; GISEL-NEXT: and w9, w11, #0x1 -; GISEL-NEXT: umov.b w11, v0[13] -; GISEL-NEXT: orr w8, w8, w9, lsl #11 -; GISEL-NEXT: and w9, w10, #0x1 -; GISEL-NEXT: umov.b w10, v0[14] -; GISEL-NEXT: orr w8, w8, w9, lsl #12 -; GISEL-NEXT: and w9, w11, #0x1 -; GISEL-NEXT: umov.b w11, v0[15] -; GISEL-NEXT: orr w8, w8, w9, lsl #13 -; GISEL-NEXT: and w9, w10, #0x1 -; GISEL-NEXT: orr w8, w8, w9, lsl #14 -; GISEL-NEXT: and w9, w11, #0x1 -; GISEL-NEXT: orr w8, w8, w9, lsl #15 -; GISEL-NEXT: strh w8, [sp, #14] -; GISEL-NEXT: and w0, w8, #0xffff -; GISEL-NEXT: add sp, sp, #16 -; GISEL-NEXT: ret +; CHECK-GI-LABEL: convert_to_bitmask_without_knowing_type: +; CHECK-GI: ; %bb.0: +; CHECK-GI-NEXT: sub sp, sp, #16 +; CHECK-GI-NEXT: .cfi_def_cfa_offset 16 +; CHECK-GI-NEXT: umov.b w8, v0[1] +; CHECK-GI-NEXT: umov.b w9, v0[0] +; CHECK-GI-NEXT: umov.b w10, v0[2] +; CHECK-GI-NEXT: umov.b w11, v0[3] +; CHECK-GI-NEXT: and w8, w8, #0x1 +; CHECK-GI-NEXT: bfi w9, w8, #1, #31 +; CHECK-GI-NEXT: and w8, w10, #0x1 +; CHECK-GI-NEXT: umov.b w10, v0[4] +; CHECK-GI-NEXT: orr w8, w9, w8, lsl #2 +; CHECK-GI-NEXT: and w9, w11, #0x1 +; CHECK-GI-NEXT: umov.b w11, v0[5] +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #3 +; CHECK-GI-NEXT: and w9, w10, #0x1 +; CHECK-GI-NEXT: umov.b w10, v0[6] +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #4 +; CHECK-GI-NEXT: and w9, w11, #0x1 +; CHECK-GI-NEXT: umov.b w11, v0[7] +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #5 +; CHECK-GI-NEXT: and w9, w10, #0x1 +; CHECK-GI-NEXT: umov.b w10, v0[8] +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #6 +; CHECK-GI-NEXT: and w9, w11, #0x1 +; CHECK-GI-NEXT: umov.b w11, v0[9] +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #7 +; CHECK-GI-NEXT: and w9, w10, #0x1 +; CHECK-GI-NEXT: umov.b w10, v0[10] +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #8 +; CHECK-GI-NEXT: and w9, w11, #0x1 +; CHECK-GI-NEXT: umov.b w11, v0[11] +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #9 +; CHECK-GI-NEXT: and w9, w10, #0x1 +; CHECK-GI-NEXT: umov.b w10, v0[12] +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #10 +; CHECK-GI-NEXT: and w9, w11, #0x1 +; CHECK-GI-NEXT: umov.b w11, v0[13] +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #11 +; CHECK-GI-NEXT: and w9, w10, #0x1 +; 
CHECK-GI-NEXT: umov.b w10, v0[14] +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #12 +; CHECK-GI-NEXT: and w9, w11, #0x1 +; CHECK-GI-NEXT: umov.b w11, v0[15] +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #13 +; CHECK-GI-NEXT: and w9, w10, #0x1 +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #14 +; CHECK-GI-NEXT: and w9, w11, #0x1 +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #15 +; CHECK-GI-NEXT: strh w8, [sp, #14] +; CHECK-GI-NEXT: and w0, w8, #0xffff +; CHECK-GI-NEXT: add sp, sp, #16 +; CHECK-GI-NEXT: ret %bitmask = bitcast <16 x i1> %vec to i16 ret i16 %bitmask @@ -575,51 +576,51 @@ define i2 @convert_to_bitmask_2xi32(<2 x i32> %vec) { } define i4 @convert_to_bitmask_4xi8(<4 x i8> %vec) { -; SDAG-LABEL: convert_to_bitmask_4xi8: -; SDAG: ; %bb.0: -; SDAG-NEXT: bic.4h v0, #255, lsl #8 -; SDAG-NEXT: adrp x8, lCPI12_0@PAGE -; SDAG-NEXT: ldr d1, [x8, lCPI12_0@PAGEOFF] -; SDAG-NEXT: cmeq.4h v0, v0, #0 -; SDAG-NEXT: bic.8b v0, v1, v0 -; SDAG-NEXT: addv.4h h0, v0 -; SDAG-NEXT: fmov w0, s0 -; SDAG-NEXT: ret +; CHECK-SD-LABEL: convert_to_bitmask_4xi8: +; CHECK-SD: ; %bb.0: +; CHECK-SD-NEXT: bic.4h v0, #255, lsl #8 +; CHECK-SD-NEXT: adrp x8, lCPI12_0@PAGE +; CHECK-SD-NEXT: ldr d1, [x8, lCPI12_0@PAGEOFF] +; CHECK-SD-NEXT: cmeq.4h v0, v0, #0 +; CHECK-SD-NEXT: bic.8b v0, v1, v0 +; CHECK-SD-NEXT: addv.4h h0, v0 +; CHECK-SD-NEXT: fmov w0, s0 +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: convert_to_bitmask_4xi8: -; GISEL: ; %bb.0: -; GISEL-NEXT: sub sp, sp, #16 -; GISEL-NEXT: .cfi_def_cfa_offset 16 -; GISEL-NEXT: mov w8, #0 ; =0x0 -; GISEL-NEXT: uzp1.8b v0, v0, v0 -; GISEL-NEXT: fmov s1, w8 -; GISEL-NEXT: mov.b v1[1], w8 -; GISEL-NEXT: mov.b v1[2], w8 -; GISEL-NEXT: mov.b v1[3], w8 -; GISEL-NEXT: cmeq.8b v0, v0, v1 -; GISEL-NEXT: mvn.8b v0, v0 -; GISEL-NEXT: umov.b w8, v0[0] -; GISEL-NEXT: umov.b w9, v0[1] -; GISEL-NEXT: mov.s v1[0], w8 -; GISEL-NEXT: umov.b w8, v0[2] -; GISEL-NEXT: mov.s v1[1], w9 -; GISEL-NEXT: umov.b w9, v0[3] -; GISEL-NEXT: mov.s v1[2], w8 -; GISEL-NEXT: mov.s v1[3], w9 -; GISEL-NEXT: mov.s w8, v1[1] -; GISEL-NEXT: mov.s w9, v1[2] -; GISEL-NEXT: fmov w11, s1 -; GISEL-NEXT: mov.s w10, v1[3] -; GISEL-NEXT: and w8, w8, #0x1 -; GISEL-NEXT: bfi w11, w8, #1, #31 -; GISEL-NEXT: and w8, w9, #0x1 -; GISEL-NEXT: and w9, w10, #0x1 -; GISEL-NEXT: orr w8, w11, w8, lsl #2 -; GISEL-NEXT: orr w8, w8, w9, lsl #3 -; GISEL-NEXT: strb w8, [sp, #15] -; GISEL-NEXT: and w0, w8, #0xff -; GISEL-NEXT: add sp, sp, #16 -; GISEL-NEXT: ret +; CHECK-GI-LABEL: convert_to_bitmask_4xi8: +; CHECK-GI: ; %bb.0: +; CHECK-GI-NEXT: sub sp, sp, #16 +; CHECK-GI-NEXT: .cfi_def_cfa_offset 16 +; CHECK-GI-NEXT: mov w8, #0 ; =0x0 +; CHECK-GI-NEXT: uzp1.8b v0, v0, v0 +; CHECK-GI-NEXT: fmov s1, w8 +; CHECK-GI-NEXT: mov.b v1[1], w8 +; CHECK-GI-NEXT: mov.b v1[2], w8 +; CHECK-GI-NEXT: mov.b v1[3], w8 +; CHECK-GI-NEXT: cmeq.8b v0, v0, v1 +; CHECK-GI-NEXT: mvn.8b v0, v0 +; CHECK-GI-NEXT: umov.b w8, v0[0] +; CHECK-GI-NEXT: umov.b w9, v0[1] +; CHECK-GI-NEXT: mov.s v1[0], w8 +; CHECK-GI-NEXT: umov.b w8, v0[2] +; CHECK-GI-NEXT: mov.s v1[1], w9 +; CHECK-GI-NEXT: umov.b w9, v0[3] +; CHECK-GI-NEXT: mov.s v1[2], w8 +; CHECK-GI-NEXT: mov.s v1[3], w9 +; CHECK-GI-NEXT: mov.s w8, v1[1] +; CHECK-GI-NEXT: mov.s w9, v1[2] +; CHECK-GI-NEXT: fmov w11, s1 +; CHECK-GI-NEXT: mov.s w10, v1[3] +; CHECK-GI-NEXT: and w8, w8, #0x1 +; CHECK-GI-NEXT: bfi w11, w8, #1, #31 +; CHECK-GI-NEXT: and w8, w9, #0x1 +; CHECK-GI-NEXT: and w9, w10, #0x1 +; CHECK-GI-NEXT: orr w8, w11, w8, lsl #2 +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #3 +; CHECK-GI-NEXT: strb w8, [sp, #15] +; CHECK-GI-NEXT: and w0, w8, #0xff +; 
CHECK-GI-NEXT: add sp, sp, #16 +; CHECK-GI-NEXT: ret %cmp_result = icmp ne <4 x i8> %vec, zeroinitializer %bitmask = bitcast <4 x i1> %cmp_result to i4 @@ -645,39 +646,39 @@ define i8 @convert_to_bitmask_8xi2(<8 x i2> %vec) { } define i4 @convert_to_bitmask_float(<4 x float> %vec) { -; SDAG-LABEL: convert_to_bitmask_float: -; SDAG: ; %bb.0: -; SDAG-NEXT: fcmgt.4s v1, v0, #0.0 -; SDAG-NEXT: fcmlt.4s v0, v0, #0.0 -; SDAG-NEXT: adrp x8, lCPI14_0@PAGE -; SDAG-NEXT: orr.16b v0, v0, v1 -; SDAG-NEXT: ldr q1, [x8, lCPI14_0@PAGEOFF] -; SDAG-NEXT: and.16b v0, v0, v1 -; SDAG-NEXT: addv.4s s0, v0 -; SDAG-NEXT: fmov w0, s0 -; SDAG-NEXT: ret +; CHECK-SD-LABEL: convert_to_bitmask_float: +; CHECK-SD: ; %bb.0: +; CHECK-SD-NEXT: fcmgt.4s v1, v0, #0.0 +; CHECK-SD-NEXT: fcmlt.4s v0, v0, #0.0 +; CHECK-SD-NEXT: adrp x8, lCPI14_0@PAGE +; CHECK-SD-NEXT: orr.16b v0, v0, v1 +; CHECK-SD-NEXT: ldr q1, [x8, lCPI14_0@PAGEOFF] +; CHECK-SD-NEXT: and.16b v0, v0, v1 +; CHECK-SD-NEXT: addv.4s s0, v0 +; CHECK-SD-NEXT: fmov w0, s0 +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: convert_to_bitmask_float: -; GISEL: ; %bb.0: -; GISEL-NEXT: sub sp, sp, #16 -; GISEL-NEXT: .cfi_def_cfa_offset 16 -; GISEL-NEXT: fcmgt.4s v1, v0, #0.0 -; GISEL-NEXT: fcmlt.4s v0, v0, #0.0 -; GISEL-NEXT: orr.16b v0, v0, v1 -; GISEL-NEXT: mov.s w8, v0[1] -; GISEL-NEXT: mov.s w9, v0[2] -; GISEL-NEXT: fmov w11, s0 -; GISEL-NEXT: mov.s w10, v0[3] -; GISEL-NEXT: and w8, w8, #0x1 -; GISEL-NEXT: bfi w11, w8, #1, #31 -; GISEL-NEXT: and w8, w9, #0x1 -; GISEL-NEXT: and w9, w10, #0x1 -; GISEL-NEXT: orr w8, w11, w8, lsl #2 -; GISEL-NEXT: orr w8, w8, w9, lsl #3 -; GISEL-NEXT: strb w8, [sp, #15] -; GISEL-NEXT: and w0, w8, #0xff -; GISEL-NEXT: add sp, sp, #16 -; GISEL-NEXT: ret +; CHECK-GI-LABEL: convert_to_bitmask_float: +; CHECK-GI: ; %bb.0: +; CHECK-GI-NEXT: sub sp, sp, #16 +; CHECK-GI-NEXT: .cfi_def_cfa_offset 16 +; CHECK-GI-NEXT: fcmgt.4s v1, v0, #0.0 +; CHECK-GI-NEXT: fcmlt.4s v0, v0, #0.0 +; CHECK-GI-NEXT: orr.16b v0, v0, v1 +; CHECK-GI-NEXT: mov.s w8, v0[1] +; CHECK-GI-NEXT: mov.s w9, v0[2] +; CHECK-GI-NEXT: fmov w11, s0 +; CHECK-GI-NEXT: mov.s w10, v0[3] +; CHECK-GI-NEXT: and w8, w8, #0x1 +; CHECK-GI-NEXT: bfi w11, w8, #1, #31 +; CHECK-GI-NEXT: and w8, w9, #0x1 +; CHECK-GI-NEXT: and w9, w10, #0x1 +; CHECK-GI-NEXT: orr w8, w11, w8, lsl #2 +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #3 +; CHECK-GI-NEXT: strb w8, [sp, #15] +; CHECK-GI-NEXT: and w0, w8, #0xff +; CHECK-GI-NEXT: add sp, sp, #16 +; CHECK-GI-NEXT: ret %cmp_result = fcmp one <4 x float> %vec, zeroinitializer @@ -688,58 +689,58 @@ define i4 @convert_to_bitmask_float(<4 x float> %vec) { ; Larger vector types don't map directly, but the can be split/truncated and then converted. ; After the comparison against 0, this is truncated to <8 x i16>, which is valid again. 
define i8 @convert_large_vector(<8 x i32> %vec) { -; SDAG-LABEL: convert_large_vector: -; SDAG: ; %bb.0: -; SDAG-NEXT: sub sp, sp, #16 -; SDAG-NEXT: .cfi_def_cfa_offset 16 -; SDAG-NEXT: cmeq.4s v1, v1, #0 -; SDAG-NEXT: cmeq.4s v0, v0, #0 -; SDAG-NEXT: adrp x8, lCPI15_0@PAGE -; SDAG-NEXT: uzp1.8h v0, v0, v1 -; SDAG-NEXT: ldr q1, [x8, lCPI15_0@PAGEOFF] -; SDAG-NEXT: bic.16b v0, v1, v0 -; SDAG-NEXT: addv.8h h0, v0 -; SDAG-NEXT: fmov w8, s0 -; SDAG-NEXT: and w0, w8, #0xff -; SDAG-NEXT: add sp, sp, #16 -; SDAG-NEXT: ret +; CHECK-SD-LABEL: convert_large_vector: +; CHECK-SD: ; %bb.0: +; CHECK-SD-NEXT: sub sp, sp, #16 +; CHECK-SD-NEXT: .cfi_def_cfa_offset 16 +; CHECK-SD-NEXT: cmeq.4s v1, v1, #0 +; CHECK-SD-NEXT: cmeq.4s v0, v0, #0 +; CHECK-SD-NEXT: adrp x8, lCPI15_0@PAGE +; CHECK-SD-NEXT: uzp1.8h v0, v0, v1 +; CHECK-SD-NEXT: ldr q1, [x8, lCPI15_0@PAGEOFF] +; CHECK-SD-NEXT: bic.16b v0, v1, v0 +; CHECK-SD-NEXT: addv.8h h0, v0 +; CHECK-SD-NEXT: fmov w8, s0 +; CHECK-SD-NEXT: and w0, w8, #0xff +; CHECK-SD-NEXT: add sp, sp, #16 +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: convert_large_vector: -; GISEL: ; %bb.0: -; GISEL-NEXT: sub sp, sp, #16 -; GISEL-NEXT: .cfi_def_cfa_offset 16 -; GISEL-NEXT: cmeq.4s v0, v0, #0 -; GISEL-NEXT: cmeq.4s v1, v1, #0 -; GISEL-NEXT: mvn.16b v0, v0 -; GISEL-NEXT: mvn.16b v1, v1 -; GISEL-NEXT: uzp1.8h v0, v0, v1 -; GISEL-NEXT: xtn.8b v0, v0 -; GISEL-NEXT: umov.b w8, v0[1] -; GISEL-NEXT: umov.b w9, v0[0] -; GISEL-NEXT: umov.b w10, v0[2] -; GISEL-NEXT: umov.b w11, v0[3] -; GISEL-NEXT: and w8, w8, #0x1 -; GISEL-NEXT: bfi w9, w8, #1, #31 -; GISEL-NEXT: and w8, w10, #0x1 -; GISEL-NEXT: umov.b w10, v0[4] -; GISEL-NEXT: orr w8, w9, w8, lsl #2 -; GISEL-NEXT: and w9, w11, #0x1 -; GISEL-NEXT: umov.b w11, v0[5] -; GISEL-NEXT: orr w8, w8, w9, lsl #3 -; GISEL-NEXT: and w9, w10, #0x1 -; GISEL-NEXT: umov.b w10, v0[6] -; GISEL-NEXT: orr w8, w8, w9, lsl #4 -; GISEL-NEXT: and w9, w11, #0x1 -; GISEL-NEXT: umov.b w11, v0[7] -; GISEL-NEXT: orr w8, w8, w9, lsl #5 -; GISEL-NEXT: and w9, w10, #0x1 -; GISEL-NEXT: orr w8, w8, w9, lsl #6 -; GISEL-NEXT: and w9, w11, #0x1 -; GISEL-NEXT: orr w8, w8, w9, lsl #7 -; GISEL-NEXT: strb w8, [sp, #15] -; GISEL-NEXT: and w0, w8, #0xff -; GISEL-NEXT: add sp, sp, #16 -; GISEL-NEXT: ret +; CHECK-GI-LABEL: convert_large_vector: +; CHECK-GI: ; %bb.0: +; CHECK-GI-NEXT: sub sp, sp, #16 +; CHECK-GI-NEXT: .cfi_def_cfa_offset 16 +; CHECK-GI-NEXT: cmeq.4s v0, v0, #0 +; CHECK-GI-NEXT: cmeq.4s v1, v1, #0 +; CHECK-GI-NEXT: mvn.16b v0, v0 +; CHECK-GI-NEXT: mvn.16b v1, v1 +; CHECK-GI-NEXT: uzp1.8h v0, v0, v1 +; CHECK-GI-NEXT: xtn.8b v0, v0 +; CHECK-GI-NEXT: umov.b w8, v0[1] +; CHECK-GI-NEXT: umov.b w9, v0[0] +; CHECK-GI-NEXT: umov.b w10, v0[2] +; CHECK-GI-NEXT: umov.b w11, v0[3] +; CHECK-GI-NEXT: and w8, w8, #0x1 +; CHECK-GI-NEXT: bfi w9, w8, #1, #31 +; CHECK-GI-NEXT: and w8, w10, #0x1 +; CHECK-GI-NEXT: umov.b w10, v0[4] +; CHECK-GI-NEXT: orr w8, w9, w8, lsl #2 +; CHECK-GI-NEXT: and w9, w11, #0x1 +; CHECK-GI-NEXT: umov.b w11, v0[5] +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #3 +; CHECK-GI-NEXT: and w9, w10, #0x1 +; CHECK-GI-NEXT: umov.b w10, v0[6] +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #4 +; CHECK-GI-NEXT: and w9, w11, #0x1 +; CHECK-GI-NEXT: umov.b w11, v0[7] +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #5 +; CHECK-GI-NEXT: and w9, w10, #0x1 +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #6 +; CHECK-GI-NEXT: and w9, w11, #0x1 +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #7 +; CHECK-GI-NEXT: strb w8, [sp, #15] +; CHECK-GI-NEXT: and w0, w8, #0xff +; CHECK-GI-NEXT: add sp, sp, #16 +; CHECK-GI-NEXT: ret %cmp_result = 
icmp ne <8 x i32> %vec, zeroinitializer @@ -748,40 +749,40 @@ define i8 @convert_large_vector(<8 x i32> %vec) { } define i4 @convert_legalized_illegal_element_size(<4 x i22> %vec) { -; SDAG-LABEL: convert_legalized_illegal_element_size: -; SDAG: ; %bb.0: -; SDAG-NEXT: movi.4s v1, #63, msl #16 -; SDAG-NEXT: adrp x8, lCPI16_0@PAGE -; SDAG-NEXT: cmtst.4s v0, v0, v1 -; SDAG-NEXT: ldr d1, [x8, lCPI16_0@PAGEOFF] -; SDAG-NEXT: xtn.4h v0, v0 -; SDAG-NEXT: and.8b v0, v0, v1 -; SDAG-NEXT: addv.4h h0, v0 -; SDAG-NEXT: fmov w0, s0 -; SDAG-NEXT: ret +; CHECK-SD-LABEL: convert_legalized_illegal_element_size: +; CHECK-SD: ; %bb.0: +; CHECK-SD-NEXT: movi.4s v1, #63, msl #16 +; CHECK-SD-NEXT: adrp x8, lCPI16_0@PAGE +; CHECK-SD-NEXT: cmtst.4s v0, v0, v1 +; CHECK-SD-NEXT: ldr d1, [x8, lCPI16_0@PAGEOFF] +; CHECK-SD-NEXT: xtn.4h v0, v0 +; CHECK-SD-NEXT: and.8b v0, v0, v1 +; CHECK-SD-NEXT: addv.4h h0, v0 +; CHECK-SD-NEXT: fmov w0, s0 +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: convert_legalized_illegal_element_size: -; GISEL: ; %bb.0: -; GISEL-NEXT: sub sp, sp, #16 -; GISEL-NEXT: .cfi_def_cfa_offset 16 -; GISEL-NEXT: movi.4s v1, #63, msl #16 -; GISEL-NEXT: and.16b v0, v0, v1 -; GISEL-NEXT: cmeq.4s v0, v0, #0 -; GISEL-NEXT: mvn.16b v0, v0 -; GISEL-NEXT: mov.s w8, v0[1] -; GISEL-NEXT: mov.s w9, v0[2] -; GISEL-NEXT: fmov w11, s0 -; GISEL-NEXT: mov.s w10, v0[3] -; GISEL-NEXT: and w8, w8, #0x1 -; GISEL-NEXT: bfi w11, w8, #1, #31 -; GISEL-NEXT: and w8, w9, #0x1 -; GISEL-NEXT: and w9, w10, #0x1 -; GISEL-NEXT: orr w8, w11, w8, lsl #2 -; GISEL-NEXT: orr w8, w8, w9, lsl #3 -; GISEL-NEXT: strb w8, [sp, #15] -; GISEL-NEXT: and w0, w8, #0xff -; GISEL-NEXT: add sp, sp, #16 -; GISEL-NEXT: ret +; CHECK-GI-LABEL: convert_legalized_illegal_element_size: +; CHECK-GI: ; %bb.0: +; CHECK-GI-NEXT: sub sp, sp, #16 +; CHECK-GI-NEXT: .cfi_def_cfa_offset 16 +; CHECK-GI-NEXT: movi.4s v1, #63, msl #16 +; CHECK-GI-NEXT: and.16b v0, v0, v1 +; CHECK-GI-NEXT: cmeq.4s v0, v0, #0 +; CHECK-GI-NEXT: mvn.16b v0, v0 +; CHECK-GI-NEXT: mov.s w8, v0[1] +; CHECK-GI-NEXT: mov.s w9, v0[2] +; CHECK-GI-NEXT: fmov w11, s0 +; CHECK-GI-NEXT: mov.s w10, v0[3] +; CHECK-GI-NEXT: and w8, w8, #0x1 +; CHECK-GI-NEXT: bfi w11, w8, #1, #31 +; CHECK-GI-NEXT: and w8, w9, #0x1 +; CHECK-GI-NEXT: and w9, w10, #0x1 +; CHECK-GI-NEXT: orr w8, w11, w8, lsl #2 +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #3 +; CHECK-GI-NEXT: strb w8, [sp, #15] +; CHECK-GI-NEXT: and w0, w8, #0xff +; CHECK-GI-NEXT: add sp, sp, #16 +; CHECK-GI-NEXT: ret %cmp_result = icmp ne <4 x i22> %vec, zeroinitializer %bitmask = bitcast <4 x i1> %cmp_result to i4 @@ -818,101 +819,101 @@ define i8 @no_direct_convert_for_bad_concat(<4 x i32> %vec) { } define <8 x i1> @no_convert_without_direct_bitcast(<8 x i16> %vec) { -; SDAG-LABEL: no_convert_without_direct_bitcast: -; SDAG: ; %bb.0: -; SDAG-NEXT: cmtst.8h v0, v0, v0 -; SDAG-NEXT: xtn.8b v0, v0 -; SDAG-NEXT: ret +; CHECK-SD-LABEL: no_convert_without_direct_bitcast: +; CHECK-SD: ; %bb.0: +; CHECK-SD-NEXT: cmtst.8h v0, v0, v0 +; CHECK-SD-NEXT: xtn.8b v0, v0 +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: no_convert_without_direct_bitcast: -; GISEL: ; %bb.0: -; GISEL-NEXT: cmeq.8h v0, v0, #0 -; GISEL-NEXT: mvn.16b v0, v0 -; GISEL-NEXT: xtn.8b v0, v0 -; GISEL-NEXT: ret +; CHECK-GI-LABEL: no_convert_without_direct_bitcast: +; CHECK-GI: ; %bb.0: +; CHECK-GI-NEXT: cmeq.8h v0, v0, #0 +; CHECK-GI-NEXT: mvn.16b v0, v0 +; CHECK-GI-NEXT: xtn.8b v0, v0 +; CHECK-GI-NEXT: ret %cmp_result = icmp ne <8 x i16> %vec, zeroinitializer ret <8 x i1> %cmp_result } define i6 
@no_combine_illegal_num_elements(<6 x i32> %vec) { -; SDAG-LABEL: no_combine_illegal_num_elements: -; SDAG: ; %bb.0: -; SDAG-NEXT: sub sp, sp, #16 -; SDAG-NEXT: .cfi_def_cfa_offset 16 -; SDAG-NEXT: fmov s0, w0 -; SDAG-NEXT: fmov s1, w4 -; SDAG-NEXT: mov.s v0[1], w1 -; SDAG-NEXT: mov.s v1[1], w5 -; SDAG-NEXT: mov.s v0[2], w2 -; SDAG-NEXT: cmeq.4s v1, v1, #0 -; SDAG-NEXT: mov.s v0[3], w3 -; SDAG-NEXT: cmeq.4s v0, v0, #0 -; SDAG-NEXT: uzp1.8h v0, v0, v1 -; SDAG-NEXT: mvn.16b v0, v0 -; SDAG-NEXT: xtn.8b v0, v0 -; SDAG-NEXT: umov.b w8, v0[0] -; SDAG-NEXT: umov.b w9, v0[1] -; SDAG-NEXT: umov.b w10, v0[2] -; SDAG-NEXT: and w8, w8, #0x1 -; SDAG-NEXT: bfi w8, w9, #1, #1 -; SDAG-NEXT: umov.b w9, v0[3] -; SDAG-NEXT: bfi w8, w10, #2, #1 -; SDAG-NEXT: umov.b w10, v0[4] -; SDAG-NEXT: bfi w8, w9, #3, #1 -; SDAG-NEXT: umov.b w9, v0[5] -; SDAG-NEXT: bfi w8, w10, #4, #1 -; SDAG-NEXT: orr w8, w8, w9, lsl #5 -; SDAG-NEXT: and w0, w8, #0x3f -; SDAG-NEXT: add sp, sp, #16 -; SDAG-NEXT: ret +; CHECK-SD-LABEL: no_combine_illegal_num_elements: +; CHECK-SD: ; %bb.0: +; CHECK-SD-NEXT: sub sp, sp, #16 +; CHECK-SD-NEXT: .cfi_def_cfa_offset 16 +; CHECK-SD-NEXT: fmov s0, w0 +; CHECK-SD-NEXT: fmov s1, w4 +; CHECK-SD-NEXT: mov.s v0[1], w1 +; CHECK-SD-NEXT: mov.s v1[1], w5 +; CHECK-SD-NEXT: mov.s v0[2], w2 +; CHECK-SD-NEXT: cmeq.4s v1, v1, #0 +; CHECK-SD-NEXT: mov.s v0[3], w3 +; CHECK-SD-NEXT: cmeq.4s v0, v0, #0 +; CHECK-SD-NEXT: uzp1.8h v0, v0, v1 +; CHECK-SD-NEXT: mvn.16b v0, v0 +; CHECK-SD-NEXT: xtn.8b v0, v0 +; CHECK-SD-NEXT: umov.b w8, v0[0] +; CHECK-SD-NEXT: umov.b w9, v0[1] +; CHECK-SD-NEXT: umov.b w10, v0[2] +; CHECK-SD-NEXT: and w8, w8, #0x1 +; CHECK-SD-NEXT: bfi w8, w9, #1, #1 +; CHECK-SD-NEXT: umov.b w9, v0[3] +; CHECK-SD-NEXT: bfi w8, w10, #2, #1 +; CHECK-SD-NEXT: umov.b w10, v0[4] +; CHECK-SD-NEXT: bfi w8, w9, #3, #1 +; CHECK-SD-NEXT: umov.b w9, v0[5] +; CHECK-SD-NEXT: bfi w8, w10, #4, #1 +; CHECK-SD-NEXT: orr w8, w8, w9, lsl #5 +; CHECK-SD-NEXT: and w0, w8, #0x3f +; CHECK-SD-NEXT: add sp, sp, #16 +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: no_combine_illegal_num_elements: -; GISEL: ; %bb.0: -; GISEL-NEXT: sub sp, sp, #16 -; GISEL-NEXT: .cfi_def_cfa_offset 16 -; GISEL-NEXT: mov.s v0[0], w0 -; GISEL-NEXT: mov.s v1[0], w4 -; GISEL-NEXT: mov.s v2[0], wzr -; GISEL-NEXT: mov.s v0[1], w1 -; GISEL-NEXT: mov.s v1[1], w5 -; GISEL-NEXT: mov.s v2[1], wzr -; GISEL-NEXT: mov.s v0[2], w2 -; GISEL-NEXT: cmeq.4s v1, v1, v2 -; GISEL-NEXT: mvn.16b v1, v1 -; GISEL-NEXT: mov.s v0[3], w3 -; GISEL-NEXT: cmeq.4s v0, v0, #0 -; GISEL-NEXT: mvn.16b v0, v0 -; GISEL-NEXT: mov.s w8, v0[1] -; GISEL-NEXT: mov.s w9, v0[2] -; GISEL-NEXT: mov.s w10, v0[3] -; GISEL-NEXT: mov.h v0[1], w8 -; GISEL-NEXT: mov.s w8, v1[1] -; GISEL-NEXT: mov.h v0[2], w9 -; GISEL-NEXT: mov.h v0[3], w10 -; GISEL-NEXT: mov.h v0[4], v1[0] -; GISEL-NEXT: mov.h v0[5], w8 -; GISEL-NEXT: umov.h w8, v0[1] -; GISEL-NEXT: umov.h w9, v0[0] -; GISEL-NEXT: umov.h w10, v0[2] -; GISEL-NEXT: umov.h w11, v0[3] -; GISEL-NEXT: and w8, w8, #0x1 -; GISEL-NEXT: bfi w9, w8, #1, #31 -; GISEL-NEXT: and w8, w10, #0x1 -; GISEL-NEXT: umov.h w10, v0[4] -; GISEL-NEXT: orr w8, w9, w8, lsl #2 -; GISEL-NEXT: and w9, w11, #0x1 -; GISEL-NEXT: umov.h w11, v0[5] -; GISEL-NEXT: orr w8, w8, w9, lsl #3 -; GISEL-NEXT: and w9, w10, #0x1 -; GISEL-NEXT: orr w8, w8, w9, lsl #4 -; GISEL-NEXT: and w9, w11, #0x1 -; GISEL-NEXT: orr w8, w8, w9, lsl #5 -; GISEL-NEXT: and w8, w8, #0x3f -; GISEL-NEXT: strb w8, [sp, #15] -; GISEL-NEXT: and w0, w8, #0xff -; GISEL-NEXT: add sp, sp, #16 -; GISEL-NEXT: ret +; CHECK-GI-LABEL: 
no_combine_illegal_num_elements: +; CHECK-GI: ; %bb.0: +; CHECK-GI-NEXT: sub sp, sp, #16 +; CHECK-GI-NEXT: .cfi_def_cfa_offset 16 +; CHECK-GI-NEXT: mov.s v0[0], w0 +; CHECK-GI-NEXT: mov.s v1[0], w4 +; CHECK-GI-NEXT: mov.s v2[0], wzr +; CHECK-GI-NEXT: mov.s v0[1], w1 +; CHECK-GI-NEXT: mov.s v1[1], w5 +; CHECK-GI-NEXT: mov.s v2[1], wzr +; CHECK-GI-NEXT: mov.s v0[2], w2 +; CHECK-GI-NEXT: cmeq.4s v1, v1, v2 +; CHECK-GI-NEXT: mvn.16b v1, v1 +; CHECK-GI-NEXT: mov.s v0[3], w3 +; CHECK-GI-NEXT: cmeq.4s v0, v0, #0 +; CHECK-GI-NEXT: mvn.16b v0, v0 +; CHECK-GI-NEXT: mov.s w8, v0[1] +; CHECK-GI-NEXT: mov.s w9, v0[2] +; CHECK-GI-NEXT: mov.s w10, v0[3] +; CHECK-GI-NEXT: mov.h v0[1], w8 +; CHECK-GI-NEXT: mov.s w8, v1[1] +; CHECK-GI-NEXT: mov.h v0[2], w9 +; CHECK-GI-NEXT: mov.h v0[3], w10 +; CHECK-GI-NEXT: mov.h v0[4], v1[0] +; CHECK-GI-NEXT: mov.h v0[5], w8 +; CHECK-GI-NEXT: umov.h w8, v0[1] +; CHECK-GI-NEXT: umov.h w9, v0[0] +; CHECK-GI-NEXT: umov.h w10, v0[2] +; CHECK-GI-NEXT: umov.h w11, v0[3] +; CHECK-GI-NEXT: and w8, w8, #0x1 +; CHECK-GI-NEXT: bfi w9, w8, #1, #31 +; CHECK-GI-NEXT: and w8, w10, #0x1 +; CHECK-GI-NEXT: umov.h w10, v0[4] +; CHECK-GI-NEXT: orr w8, w9, w8, lsl #2 +; CHECK-GI-NEXT: and w9, w11, #0x1 +; CHECK-GI-NEXT: umov.h w11, v0[5] +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #3 +; CHECK-GI-NEXT: and w9, w10, #0x1 +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #4 +; CHECK-GI-NEXT: and w9, w11, #0x1 +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #5 +; CHECK-GI-NEXT: and w8, w8, #0x3f +; CHECK-GI-NEXT: strb w8, [sp, #15] +; CHECK-GI-NEXT: and w0, w8, #0xff +; CHECK-GI-NEXT: add sp, sp, #16 +; CHECK-GI-NEXT: ret %cmp_result = icmp ne <6 x i32> %vec, zeroinitializer %bitmask = bitcast <6 x i1> %cmp_result to i6 @@ -921,220 +922,220 @@ define i6 @no_combine_illegal_num_elements(<6 x i32> %vec) { ; Only apply the combine when casting a vector to a scalar. 
define <2 x i8> @vector_to_vector_cast(<16 x i1> %arg) nounwind { -; SDAG-LABEL: vector_to_vector_cast: -; SDAG: ; %bb.0: -; SDAG-NEXT: sub sp, sp, #16 -; SDAG-NEXT: shl.16b v0, v0, #7 -; SDAG-NEXT: adrp x8, lCPI20_0@PAGE -; SDAG-NEXT: ldr q1, [x8, lCPI20_0@PAGEOFF] -; SDAG-NEXT: add x8, sp, #14 -; SDAG-NEXT: cmlt.16b v0, v0, #0 -; SDAG-NEXT: and.16b v0, v0, v1 -; SDAG-NEXT: ext.16b v1, v0, v0, #8 -; SDAG-NEXT: zip1.16b v0, v0, v1 -; SDAG-NEXT: addv.8h h0, v0 -; SDAG-NEXT: str h0, [sp, #14] -; SDAG-NEXT: ld1.b { v0 }[0], [x8] -; SDAG-NEXT: orr x8, x8, #0x1 -; SDAG-NEXT: ld1.b { v0 }[4], [x8] -; SDAG-NEXT: ; kill: def $d0 killed $d0 killed $q0 -; SDAG-NEXT: add sp, sp, #16 -; SDAG-NEXT: ret +; CHECK-SD-LABEL: vector_to_vector_cast: +; CHECK-SD: ; %bb.0: +; CHECK-SD-NEXT: sub sp, sp, #16 +; CHECK-SD-NEXT: shl.16b v0, v0, #7 +; CHECK-SD-NEXT: adrp x8, lCPI20_0@PAGE +; CHECK-SD-NEXT: ldr q1, [x8, lCPI20_0@PAGEOFF] +; CHECK-SD-NEXT: add x8, sp, #14 +; CHECK-SD-NEXT: cmlt.16b v0, v0, #0 +; CHECK-SD-NEXT: and.16b v0, v0, v1 +; CHECK-SD-NEXT: ext.16b v1, v0, v0, #8 +; CHECK-SD-NEXT: zip1.16b v0, v0, v1 +; CHECK-SD-NEXT: addv.8h h0, v0 +; CHECK-SD-NEXT: str h0, [sp, #14] +; CHECK-SD-NEXT: ld1.b { v0 }[0], [x8] +; CHECK-SD-NEXT: orr x8, x8, #0x1 +; CHECK-SD-NEXT: ld1.b { v0 }[4], [x8] +; CHECK-SD-NEXT: ; kill: def $d0 killed $d0 killed $q0 +; CHECK-SD-NEXT: add sp, sp, #16 +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: vector_to_vector_cast: -; GISEL: ; %bb.0: -; GISEL-NEXT: sub sp, sp, #16 -; GISEL-NEXT: umov.b w8, v0[1] -; GISEL-NEXT: mov d1, v0[1] -; GISEL-NEXT: umov.b w10, v0[1] -; GISEL-NEXT: umov.b w9, v0[0] -; GISEL-NEXT: umov.b w13, v0[0] -; GISEL-NEXT: umov.b w14, v0[2] -; GISEL-NEXT: umov.b w15, v0[3] -; GISEL-NEXT: umov.b w11, v0[2] -; GISEL-NEXT: umov.b w16, v0[4] -; GISEL-NEXT: umov.b w17, v0[5] -; GISEL-NEXT: umov.b w12, v0[3] -; GISEL-NEXT: and w8, w8, #0x1 -; GISEL-NEXT: and w10, w10, #0x1 -; GISEL-NEXT: umov.b w0, v1[1] -; GISEL-NEXT: bfi w9, w8, #1, #31 -; GISEL-NEXT: bfi w13, w10, #1, #31 -; GISEL-NEXT: and w14, w14, #0x1 -; GISEL-NEXT: umov.b w8, v1[0] -; GISEL-NEXT: umov.b w10, v1[2] -; GISEL-NEXT: and w15, w15, #0x1 -; GISEL-NEXT: orr w13, w13, w14, lsl #2 -; GISEL-NEXT: umov.b w14, v1[3] -; GISEL-NEXT: and w11, w11, #0x1 -; GISEL-NEXT: and w0, w0, #0x1 -; GISEL-NEXT: and w16, w16, #0x1 -; GISEL-NEXT: orr w9, w9, w11, lsl #2 -; GISEL-NEXT: orr w13, w13, w15, lsl #3 -; GISEL-NEXT: umov.b w15, v1[4] -; GISEL-NEXT: umov.b w11, v0[6] -; GISEL-NEXT: bfi w8, w0, #1, #31 -; GISEL-NEXT: and w10, w10, #0x1 -; GISEL-NEXT: and w17, w17, #0x1 -; GISEL-NEXT: orr w13, w13, w16, lsl #4 -; GISEL-NEXT: and w14, w14, #0x1 -; GISEL-NEXT: umov.b w0, v0[7] -; GISEL-NEXT: orr w8, w8, w10, lsl #2 -; GISEL-NEXT: umov.b w10, v1[5] -; GISEL-NEXT: umov.b w16, v1[6] -; GISEL-NEXT: orr w13, w13, w17, lsl #5 -; GISEL-NEXT: umov.b w17, v0[4] -; GISEL-NEXT: and w15, w15, #0x1 -; GISEL-NEXT: orr w8, w8, w14, lsl #3 -; GISEL-NEXT: and w12, w12, #0x1 -; GISEL-NEXT: and w11, w11, #0x1 -; GISEL-NEXT: umov.b w14, v1[7] -; GISEL-NEXT: orr w9, w9, w12, lsl #3 -; GISEL-NEXT: orr w11, w13, w11, lsl #6 -; GISEL-NEXT: orr w8, w8, w15, lsl #4 -; GISEL-NEXT: umov.b w15, v0[5] -; GISEL-NEXT: and w10, w10, #0x1 -; GISEL-NEXT: and w0, w0, #0x1 -; GISEL-NEXT: and w12, w17, #0x1 -; GISEL-NEXT: umov.b w13, v0[1] -; GISEL-NEXT: orr w8, w8, w10, lsl #5 -; GISEL-NEXT: and w16, w16, #0x1 -; GISEL-NEXT: orr w9, w9, w12, lsl #4 -; GISEL-NEXT: umov.b w10, v0[0] -; GISEL-NEXT: orr w11, w11, w0, lsl #7 -; GISEL-NEXT: and w14, w14, #0x1 -; 
GISEL-NEXT: and w12, w15, #0x1 -; GISEL-NEXT: umov.b w15, v0[2] -; GISEL-NEXT: orr w8, w8, w16, lsl #6 -; GISEL-NEXT: orr w9, w9, w12, lsl #5 -; GISEL-NEXT: umov.b w12, v0[6] -; GISEL-NEXT: strb w11, [sp, #8] -; GISEL-NEXT: and w11, w13, #0x1 -; GISEL-NEXT: umov.b w13, v0[3] -; GISEL-NEXT: orr w8, w8, w14, lsl #7 -; GISEL-NEXT: umov.b w14, v0[7] -; GISEL-NEXT: ldr b0, [sp, #8] -; GISEL-NEXT: bfi w10, w11, #1, #31 -; GISEL-NEXT: and w11, w15, #0x1 -; GISEL-NEXT: strb w8, [sp, #9] -; GISEL-NEXT: umov.b w15, v0[4] -; GISEL-NEXT: and w8, w12, #0x1 -; GISEL-NEXT: orr w10, w10, w11, lsl #2 -; GISEL-NEXT: orr w8, w9, w8, lsl #6 -; GISEL-NEXT: and w9, w13, #0x1 -; GISEL-NEXT: umov.b w11, v0[1] -; GISEL-NEXT: orr w9, w10, w9, lsl #3 -; GISEL-NEXT: umov.b w10, v0[5] -; GISEL-NEXT: umov.b w12, v0[0] -; GISEL-NEXT: and w13, w14, #0x1 -; GISEL-NEXT: umov.b w16, v0[2] -; GISEL-NEXT: umov.b w17, v0[3] -; GISEL-NEXT: and w14, w15, #0x1 -; GISEL-NEXT: umov.b w15, v0[2] -; GISEL-NEXT: orr w8, w8, w13, lsl #7 -; GISEL-NEXT: orr w9, w9, w14, lsl #4 -; GISEL-NEXT: umov.b w13, v0[6] -; GISEL-NEXT: and w11, w11, #0x1 -; GISEL-NEXT: umov.b w14, v0[3] -; GISEL-NEXT: strb w8, [sp, #10] -; GISEL-NEXT: and w8, w10, #0x1 -; GISEL-NEXT: bfi w12, w11, #1, #31 -; GISEL-NEXT: orr w8, w9, w8, lsl #5 -; GISEL-NEXT: umov.b w10, v0[4] -; GISEL-NEXT: and w9, w15, #0x1 -; GISEL-NEXT: umov.b w11, v0[7] -; GISEL-NEXT: umov.b w15, v0[1] -; GISEL-NEXT: orr w9, w12, w9, lsl #2 -; GISEL-NEXT: umov.b w12, v0[5] -; GISEL-NEXT: and w13, w13, #0x1 -; GISEL-NEXT: and w14, w14, #0x1 -; GISEL-NEXT: orr w8, w8, w13, lsl #6 -; GISEL-NEXT: umov.b w13, v0[0] -; GISEL-NEXT: orr w9, w9, w14, lsl #3 -; GISEL-NEXT: and w10, w10, #0x1 -; GISEL-NEXT: umov.b w14, v0[6] -; GISEL-NEXT: and w11, w11, #0x1 -; GISEL-NEXT: and w15, w15, #0x1 -; GISEL-NEXT: umov.b w0, v0[3] -; GISEL-NEXT: orr w9, w9, w10, lsl #4 -; GISEL-NEXT: and w10, w12, #0x1 -; GISEL-NEXT: umov.b w12, v0[7] -; GISEL-NEXT: orr w8, w8, w11, lsl #7 -; GISEL-NEXT: bfi w13, w15, #1, #31 -; GISEL-NEXT: and w11, w16, #0x1 -; GISEL-NEXT: orr w9, w9, w10, lsl #5 -; GISEL-NEXT: and w10, w14, #0x1 -; GISEL-NEXT: umov.b w14, v0[4] -; GISEL-NEXT: strb w8, [sp, #11] -; GISEL-NEXT: umov.b w15, v0[1] -; GISEL-NEXT: umov.b w16, v0[3] -; GISEL-NEXT: orr w8, w9, w10, lsl #6 -; GISEL-NEXT: orr w9, w13, w11, lsl #2 -; GISEL-NEXT: and w10, w12, #0x1 -; GISEL-NEXT: and w11, w17, #0x1 -; GISEL-NEXT: umov.b w12, v0[5] -; GISEL-NEXT: umov.b w17, v0[0] -; GISEL-NEXT: orr w8, w8, w10, lsl #7 -; GISEL-NEXT: orr w9, w9, w11, lsl #3 -; GISEL-NEXT: umov.b w10, v0[1] -; GISEL-NEXT: and w11, w14, #0x1 -; GISEL-NEXT: umov.b w14, v0[0] -; GISEL-NEXT: and w15, w15, #0x1 -; GISEL-NEXT: orr w9, w9, w11, lsl #4 -; GISEL-NEXT: umov.b w11, v0[2] -; GISEL-NEXT: umov.b w13, v0[6] -; GISEL-NEXT: and w12, w12, #0x1 -; GISEL-NEXT: bfi w17, w15, #1, #31 -; GISEL-NEXT: umov.b w15, v0[5] -; GISEL-NEXT: orr w9, w9, w12, lsl #5 -; GISEL-NEXT: and w10, w10, #0x1 -; GISEL-NEXT: umov.b w12, v0[2] -; GISEL-NEXT: bfi w14, w10, #1, #31 -; GISEL-NEXT: umov.b w10, v0[4] -; GISEL-NEXT: ldr b1, [sp, #9] -; GISEL-NEXT: and w11, w11, #0x1 -; GISEL-NEXT: and w13, w13, #0x1 -; GISEL-NEXT: strb w8, [sp, #12] -; GISEL-NEXT: orr w11, w14, w11, lsl #2 -; GISEL-NEXT: and w14, w16, #0x1 -; GISEL-NEXT: umov.b w16, v0[4] -; GISEL-NEXT: and w12, w12, #0x1 -; GISEL-NEXT: and w15, w15, #0x1 -; GISEL-NEXT: orr w9, w9, w13, lsl #6 -; GISEL-NEXT: orr w11, w11, w14, lsl #3 -; GISEL-NEXT: orr w12, w17, w12, lsl #2 -; GISEL-NEXT: and w10, w10, #0x1 -; GISEL-NEXT: and 
w17, w0, #0x1 -; GISEL-NEXT: umov.b w0, v0[5] -; GISEL-NEXT: umov.b w14, v0[6] -; GISEL-NEXT: orr w10, w11, w10, lsl #4 -; GISEL-NEXT: orr w12, w12, w17, lsl #3 -; GISEL-NEXT: umov.b w11, v0[7] -; GISEL-NEXT: and w16, w16, #0x1 -; GISEL-NEXT: umov.b w17, v0[6] -; GISEL-NEXT: orr w10, w10, w15, lsl #5 -; GISEL-NEXT: umov.b w15, v0[7] -; GISEL-NEXT: orr w12, w12, w16, lsl #4 -; GISEL-NEXT: and w16, w0, #0x1 -; GISEL-NEXT: umov.b w0, v0[7] -; GISEL-NEXT: and w14, w14, #0x1 -; GISEL-NEXT: orr w12, w12, w16, lsl #5 -; GISEL-NEXT: orr w10, w10, w14, lsl #6 -; GISEL-NEXT: and w11, w11, #0x1 -; GISEL-NEXT: and w13, w17, #0x1 -; GISEL-NEXT: orr w9, w9, w11, lsl #7 -; GISEL-NEXT: mov.s v0[1], v1[0] -; GISEL-NEXT: orr w11, w12, w13, lsl #6 -; GISEL-NEXT: and w12, w15, #0x1 -; GISEL-NEXT: ; kill: def $d0 killed $d0 killed $q0 -; GISEL-NEXT: orr w8, w10, w12, lsl #7 -; GISEL-NEXT: and w10, w0, #0x1 -; GISEL-NEXT: strb w9, [sp, #13] -; GISEL-NEXT: orr w9, w11, w10, lsl #7 -; GISEL-NEXT: strb w8, [sp, #14] -; GISEL-NEXT: strb w9, [sp, #15] -; GISEL-NEXT: add sp, sp, #16 -; GISEL-NEXT: ret +; CHECK-GI-LABEL: vector_to_vector_cast: +; CHECK-GI: ; %bb.0: +; CHECK-GI-NEXT: sub sp, sp, #16 +; CHECK-GI-NEXT: umov.b w8, v0[1] +; CHECK-GI-NEXT: mov d1, v0[1] +; CHECK-GI-NEXT: umov.b w10, v0[1] +; CHECK-GI-NEXT: umov.b w9, v0[0] +; CHECK-GI-NEXT: umov.b w13, v0[0] +; CHECK-GI-NEXT: umov.b w14, v0[2] +; CHECK-GI-NEXT: umov.b w15, v0[3] +; CHECK-GI-NEXT: umov.b w11, v0[2] +; CHECK-GI-NEXT: umov.b w16, v0[4] +; CHECK-GI-NEXT: umov.b w17, v0[5] +; CHECK-GI-NEXT: umov.b w12, v0[3] +; CHECK-GI-NEXT: and w8, w8, #0x1 +; CHECK-GI-NEXT: and w10, w10, #0x1 +; CHECK-GI-NEXT: umov.b w0, v1[1] +; CHECK-GI-NEXT: bfi w9, w8, #1, #31 +; CHECK-GI-NEXT: bfi w13, w10, #1, #31 +; CHECK-GI-NEXT: and w14, w14, #0x1 +; CHECK-GI-NEXT: umov.b w8, v1[0] +; CHECK-GI-NEXT: umov.b w10, v1[2] +; CHECK-GI-NEXT: and w15, w15, #0x1 +; CHECK-GI-NEXT: orr w13, w13, w14, lsl #2 +; CHECK-GI-NEXT: umov.b w14, v1[3] +; CHECK-GI-NEXT: and w11, w11, #0x1 +; CHECK-GI-NEXT: and w0, w0, #0x1 +; CHECK-GI-NEXT: and w16, w16, #0x1 +; CHECK-GI-NEXT: orr w9, w9, w11, lsl #2 +; CHECK-GI-NEXT: orr w13, w13, w15, lsl #3 +; CHECK-GI-NEXT: umov.b w15, v1[4] +; CHECK-GI-NEXT: umov.b w11, v0[6] +; CHECK-GI-NEXT: bfi w8, w0, #1, #31 +; CHECK-GI-NEXT: and w10, w10, #0x1 +; CHECK-GI-NEXT: and w17, w17, #0x1 +; CHECK-GI-NEXT: orr w13, w13, w16, lsl #4 +; CHECK-GI-NEXT: and w14, w14, #0x1 +; CHECK-GI-NEXT: umov.b w0, v0[7] +; CHECK-GI-NEXT: orr w8, w8, w10, lsl #2 +; CHECK-GI-NEXT: umov.b w10, v1[5] +; CHECK-GI-NEXT: umov.b w16, v1[6] +; CHECK-GI-NEXT: orr w13, w13, w17, lsl #5 +; CHECK-GI-NEXT: umov.b w17, v0[4] +; CHECK-GI-NEXT: and w15, w15, #0x1 +; CHECK-GI-NEXT: orr w8, w8, w14, lsl #3 +; CHECK-GI-NEXT: and w12, w12, #0x1 +; CHECK-GI-NEXT: and w11, w11, #0x1 +; CHECK-GI-NEXT: umov.b w14, v1[7] +; CHECK-GI-NEXT: orr w9, w9, w12, lsl #3 +; CHECK-GI-NEXT: orr w11, w13, w11, lsl #6 +; CHECK-GI-NEXT: orr w8, w8, w15, lsl #4 +; CHECK-GI-NEXT: umov.b w15, v0[5] +; CHECK-GI-NEXT: and w10, w10, #0x1 +; CHECK-GI-NEXT: and w0, w0, #0x1 +; CHECK-GI-NEXT: and w12, w17, #0x1 +; CHECK-GI-NEXT: umov.b w13, v0[1] +; CHECK-GI-NEXT: orr w8, w8, w10, lsl #5 +; CHECK-GI-NEXT: and w16, w16, #0x1 +; CHECK-GI-NEXT: orr w9, w9, w12, lsl #4 +; CHECK-GI-NEXT: umov.b w10, v0[0] +; CHECK-GI-NEXT: orr w11, w11, w0, lsl #7 +; CHECK-GI-NEXT: and w14, w14, #0x1 +; CHECK-GI-NEXT: and w12, w15, #0x1 +; CHECK-GI-NEXT: umov.b w15, v0[2] +; CHECK-GI-NEXT: orr w8, w8, w16, lsl #6 +; CHECK-GI-NEXT: orr w9, 
w9, w12, lsl #5 +; CHECK-GI-NEXT: umov.b w12, v0[6] +; CHECK-GI-NEXT: strb w11, [sp, #8] +; CHECK-GI-NEXT: and w11, w13, #0x1 +; CHECK-GI-NEXT: umov.b w13, v0[3] +; CHECK-GI-NEXT: orr w8, w8, w14, lsl #7 +; CHECK-GI-NEXT: umov.b w14, v0[7] +; CHECK-GI-NEXT: ldr b0, [sp, #8] +; CHECK-GI-NEXT: bfi w10, w11, #1, #31 +; CHECK-GI-NEXT: and w11, w15, #0x1 +; CHECK-GI-NEXT: strb w8, [sp, #9] +; CHECK-GI-NEXT: umov.b w15, v0[4] +; CHECK-GI-NEXT: and w8, w12, #0x1 +; CHECK-GI-NEXT: orr w10, w10, w11, lsl #2 +; CHECK-GI-NEXT: orr w8, w9, w8, lsl #6 +; CHECK-GI-NEXT: and w9, w13, #0x1 +; CHECK-GI-NEXT: umov.b w11, v0[1] +; CHECK-GI-NEXT: orr w9, w10, w9, lsl #3 +; CHECK-GI-NEXT: umov.b w10, v0[5] +; CHECK-GI-NEXT: umov.b w12, v0[0] +; CHECK-GI-NEXT: and w13, w14, #0x1 +; CHECK-GI-NEXT: umov.b w16, v0[2] +; CHECK-GI-NEXT: umov.b w17, v0[3] +; CHECK-GI-NEXT: and w14, w15, #0x1 +; CHECK-GI-NEXT: umov.b w15, v0[2] +; CHECK-GI-NEXT: orr w8, w8, w13, lsl #7 +; CHECK-GI-NEXT: orr w9, w9, w14, lsl #4 +; CHECK-GI-NEXT: umov.b w13, v0[6] +; CHECK-GI-NEXT: and w11, w11, #0x1 +; CHECK-GI-NEXT: umov.b w14, v0[3] +; CHECK-GI-NEXT: strb w8, [sp, #10] +; CHECK-GI-NEXT: and w8, w10, #0x1 +; CHECK-GI-NEXT: bfi w12, w11, #1, #31 +; CHECK-GI-NEXT: orr w8, w9, w8, lsl #5 +; CHECK-GI-NEXT: umov.b w10, v0[4] +; CHECK-GI-NEXT: and w9, w15, #0x1 +; CHECK-GI-NEXT: umov.b w11, v0[7] +; CHECK-GI-NEXT: umov.b w15, v0[1] +; CHECK-GI-NEXT: orr w9, w12, w9, lsl #2 +; CHECK-GI-NEXT: umov.b w12, v0[5] +; CHECK-GI-NEXT: and w13, w13, #0x1 +; CHECK-GI-NEXT: and w14, w14, #0x1 +; CHECK-GI-NEXT: orr w8, w8, w13, lsl #6 +; CHECK-GI-NEXT: umov.b w13, v0[0] +; CHECK-GI-NEXT: orr w9, w9, w14, lsl #3 +; CHECK-GI-NEXT: and w10, w10, #0x1 +; CHECK-GI-NEXT: umov.b w14, v0[6] +; CHECK-GI-NEXT: and w11, w11, #0x1 +; CHECK-GI-NEXT: and w15, w15, #0x1 +; CHECK-GI-NEXT: umov.b w0, v0[3] +; CHECK-GI-NEXT: orr w9, w9, w10, lsl #4 +; CHECK-GI-NEXT: and w10, w12, #0x1 +; CHECK-GI-NEXT: umov.b w12, v0[7] +; CHECK-GI-NEXT: orr w8, w8, w11, lsl #7 +; CHECK-GI-NEXT: bfi w13, w15, #1, #31 +; CHECK-GI-NEXT: and w11, w16, #0x1 +; CHECK-GI-NEXT: orr w9, w9, w10, lsl #5 +; CHECK-GI-NEXT: and w10, w14, #0x1 +; CHECK-GI-NEXT: umov.b w14, v0[4] +; CHECK-GI-NEXT: strb w8, [sp, #11] +; CHECK-GI-NEXT: umov.b w15, v0[1] +; CHECK-GI-NEXT: umov.b w16, v0[3] +; CHECK-GI-NEXT: orr w8, w9, w10, lsl #6 +; CHECK-GI-NEXT: orr w9, w13, w11, lsl #2 +; CHECK-GI-NEXT: and w10, w12, #0x1 +; CHECK-GI-NEXT: and w11, w17, #0x1 +; CHECK-GI-NEXT: umov.b w12, v0[5] +; CHECK-GI-NEXT: umov.b w17, v0[0] +; CHECK-GI-NEXT: orr w8, w8, w10, lsl #7 +; CHECK-GI-NEXT: orr w9, w9, w11, lsl #3 +; CHECK-GI-NEXT: umov.b w10, v0[1] +; CHECK-GI-NEXT: and w11, w14, #0x1 +; CHECK-GI-NEXT: umov.b w14, v0[0] +; CHECK-GI-NEXT: and w15, w15, #0x1 +; CHECK-GI-NEXT: orr w9, w9, w11, lsl #4 +; CHECK-GI-NEXT: umov.b w11, v0[2] +; CHECK-GI-NEXT: umov.b w13, v0[6] +; CHECK-GI-NEXT: and w12, w12, #0x1 +; CHECK-GI-NEXT: bfi w17, w15, #1, #31 +; CHECK-GI-NEXT: umov.b w15, v0[5] +; CHECK-GI-NEXT: orr w9, w9, w12, lsl #5 +; CHECK-GI-NEXT: and w10, w10, #0x1 +; CHECK-GI-NEXT: umov.b w12, v0[2] +; CHECK-GI-NEXT: bfi w14, w10, #1, #31 +; CHECK-GI-NEXT: umov.b w10, v0[4] +; CHECK-GI-NEXT: ldr b1, [sp, #9] +; CHECK-GI-NEXT: and w11, w11, #0x1 +; CHECK-GI-NEXT: and w13, w13, #0x1 +; CHECK-GI-NEXT: strb w8, [sp, #12] +; CHECK-GI-NEXT: orr w11, w14, w11, lsl #2 +; CHECK-GI-NEXT: and w14, w16, #0x1 +; CHECK-GI-NEXT: umov.b w16, v0[4] +; CHECK-GI-NEXT: and w12, w12, #0x1 +; CHECK-GI-NEXT: and w15, w15, #0x1 +; CHECK-GI-NEXT: orr 
w9, w9, w13, lsl #6 +; CHECK-GI-NEXT: orr w11, w11, w14, lsl #3 +; CHECK-GI-NEXT: orr w12, w17, w12, lsl #2 +; CHECK-GI-NEXT: and w10, w10, #0x1 +; CHECK-GI-NEXT: and w17, w0, #0x1 +; CHECK-GI-NEXT: umov.b w0, v0[5] +; CHECK-GI-NEXT: umov.b w14, v0[6] +; CHECK-GI-NEXT: orr w10, w11, w10, lsl #4 +; CHECK-GI-NEXT: orr w12, w12, w17, lsl #3 +; CHECK-GI-NEXT: umov.b w11, v0[7] +; CHECK-GI-NEXT: and w16, w16, #0x1 +; CHECK-GI-NEXT: umov.b w17, v0[6] +; CHECK-GI-NEXT: orr w10, w10, w15, lsl #5 +; CHECK-GI-NEXT: umov.b w15, v0[7] +; CHECK-GI-NEXT: orr w12, w12, w16, lsl #4 +; CHECK-GI-NEXT: and w16, w0, #0x1 +; CHECK-GI-NEXT: umov.b w0, v0[7] +; CHECK-GI-NEXT: and w14, w14, #0x1 +; CHECK-GI-NEXT: orr w12, w12, w16, lsl #5 +; CHECK-GI-NEXT: orr w10, w10, w14, lsl #6 +; CHECK-GI-NEXT: and w11, w11, #0x1 +; CHECK-GI-NEXT: and w13, w17, #0x1 +; CHECK-GI-NEXT: orr w9, w9, w11, lsl #7 +; CHECK-GI-NEXT: mov.s v0[1], v1[0] +; CHECK-GI-NEXT: orr w11, w12, w13, lsl #6 +; CHECK-GI-NEXT: and w12, w15, #0x1 +; CHECK-GI-NEXT: ; kill: def $d0 killed $d0 killed $q0 +; CHECK-GI-NEXT: orr w8, w10, w12, lsl #7 +; CHECK-GI-NEXT: and w10, w0, #0x1 +; CHECK-GI-NEXT: strb w9, [sp, #13] +; CHECK-GI-NEXT: orr w9, w11, w10, lsl #7 +; CHECK-GI-NEXT: strb w8, [sp, #14] +; CHECK-GI-NEXT: strb w9, [sp, #15] +; CHECK-GI-NEXT: add sp, sp, #16 +; CHECK-GI-NEXT: ret %bc = bitcast <16 x i1> %arg to <2 x i8> ret <2 x i8> %bc }